diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl index 5cb1d2e7a3df4383b17555d3c5513bbb6d567a4e..93cac36bf4f689de57400a82e22b49cf0344ff7b 100644 --- a/activation/impls/artifacts/benchmark/activation.jsonl +++ b/activation/impls/artifacts/benchmark/activation.jsonl @@ -1,9 +1,9 @@ -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02110099990204617, "p50": 0.022570000055566197, "p90": 0.02266100000269944, "mean": 0.022242599993660406, "iqr": 0.0007410000080199097, "raw_times": [0.022570000055566197, 0.022961000013310695, 0.02191999999467953, 0.02266100000269944, 0.02110099990204617], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02889100005631917, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02585100003216212, "p50": 0.02831100005096232, "p90": 0.02854100000604376, "mean": 0.02791500000967062, "iqr": 0.0013400000398178236, "raw_times": [0.02585100003216212, 0.02854100000604376, 0.02967099999295897, 0.02831100005096232, 0.027200999966225936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031750999937685265, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02804099995046272, "p50": 0.028271000019230996, "p90": 0.02853099999811093, "mean": 0.032097199982672464, "iqr": 0.0004900000476482091, "raw_times": [0.04760199999509496, 0.028271000019230996, 0.02853099999811093, 0.02804099995046272, 0.02804099995046272], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031132000003708526, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02513000004000787, "p50": 0.027131000024382956, "p90": 0.027909999971598154, "mean": 0.027204600019103964, "iqr": 0.0014589999182135216, "raw_times": [0.02513000004000787, 0.027131000024382956, 0.027909999971598154, 0.029401000006146205, 0.026451000053384632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030690999892613036, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02570000003743189, "p50": 0.026741000056063058, "p90": 0.02731099993980024, "mean": 0.02703079999264446, "iqr": 0.0012099999366910197, "raw_times": [0.02570000003743189, 0.02731099993980024, 0.029300999926817894, 0.02610100000310922, 0.026741000056063058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030331000061778468, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025050999965969822, "p50": 0.026220999984616356, "p90": 0.028031000056216726, "mean": 0.026778999995258346, "iqr": 0.0018400000953988638, "raw_times": [0.025050999965969822, 0.026190999960817862, 0.026220999984616356, 0.028031000056216726, 0.028401000008670962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031100999990485434, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02494000000297092, "p50": 0.026971000011144497, "p90": 0.02789099994515709, "mean": 0.027030599972022173, "iqr": 0.0009699999736767495, "raw_times": [0.02494000000297092, 0.026971000011144497, 0.02789099994515709, 0.02842999992935802, 0.02692099997148034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029161000043131935, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024340999971173005, "p50": 0.02594099998987076, "p90": 0.027440999929240206, "mean": 0.026286999968760938, "iqr": 0.0016499999446750735, "raw_times": [0.024340999971173005, 0.027920999968955584, 0.027440999929240206, 0.02594099998987076, 0.025790999984565133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02797100000861974, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025551000021550863, "p50": 0.026880999939749017, "p90": 0.028271000019230996, "mean": 0.027656800011754967, "iqr": 0.002240999947389355, "raw_times": [0.025551000021550863, 0.026880999939749017, 0.02603000007184164, 0.03155100000640232, 0.028271000019230996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02960100005111599, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02251099999739381, "p50": 0.02324100000805629, "p90": 0.023539999972399528, "mean": 0.023146399996676337, "iqr": 0.0007499999696847226, "raw_times": [0.023539999972399528, 0.022790000002714805, 0.02324100000805629, 0.02365000000281725, 0.02251099999739381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029810000000907166, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027370999987397227, "p50": 0.028240999995432503, "p90": 0.028329999963716546, "mean": 0.02825879998908931, "iqr": 0.00023899997358967084, "raw_times": [0.028090999990126875, 0.028240999995432503, 0.028329999963716546, 0.029261000008773408, 0.027370999987397227], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03212000001440174, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02641099996480989, "p50": 0.027520999992702855, "p90": 0.028440999983558868, "mean": 0.027734599996165343, "iqr": 0.001440999938040477, "raw_times": [0.02641099996480989, 0.028440999983558868, 0.029299999994236714, 0.027520999992702855, 0.02700000004551839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.032080999972095015, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026321000007101247, "p50": 0.02724099999795726, "p90": 0.028659999998126295, "mean": 0.02923079999845868, "iqr": 0.0014990000067882647, "raw_times": [0.026321000007101247, 0.03677099999777056, 0.028659999998126295, 0.02716099999133803, 0.02724099999795726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031121000006351096, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025979999975334067, "p50": 0.028520999990178098, "p90": 0.028720999978304462, "mean": 0.027810800008865044, "iqr": 0.00169999992749581, "raw_times": [0.025979999975334067, 0.028811000049699942, 0.028520999990178098, 0.027021000050808652, 0.028720999978304462], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02976100000751103, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026359999992564553, "p50": 0.027051000017763727, "p90": 0.027101000000584463, "mean": 0.027004599996871548, "iqr": 0.00035100003970001126, "raw_times": [0.027101000000584463, 0.027051000017763727, 0.027761000012560544, 0.026749999960884452, 0.026359999992564553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029620999953294813, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02576100001761006, "p50": 0.027530000011211087, "p90": 0.02828099997032041, "mean": 0.0273743999969156, "iqr": 0.001340999972399004, "raw_times": [0.02576100001761006, 0.02828099997032041, 0.026939999997921404, 0.02835999998751504, 0.027530000011211087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030121000008875853, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025459999960730784, "p50": 0.028590999988864496, "p90": 0.02870100001928222, "mean": 0.027812799999082927, "iqr": 0.00113999999484804, "raw_times": [0.025459999960730784, 0.02870100001928222, 0.028751000002102955, 0.02756100002443418, 0.028590999988864496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02632999996876606, "p50": 0.027500999976837193, "p90": 0.028640000039104052, "mean": 0.028318399995441723, "iqr": 0.0021100000253682083, "raw_times": [0.02632999996876606, 0.03259099997876547, 0.027500999976837193, 0.026530000013735844, 0.028640000039104052], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029991000019435887, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html index 0333e61899bfbfb799696bb358236ac894538ab4..0ee10cb621cd4a8fa09e449aade63a5a1449d022 100644 --- a/activation/impls/hf_kernels_swiglu.html +++ b/activation/impls/hf_kernels_swiglu.html @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.26s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
-
Tue Oct 28 14:07:54 2025       
+
Wed Oct 29 14:26:44 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P0             80W /  350W |       0MiB /  46068MiB |      1%      Default |
+| N/A   32C    P0            133W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 4.26s
+Cell: benchmark | 4.19s
  | 
 
 Raw
@@ -3976,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.944us      1745.67%      70.944us      70.944us             1  
-                                      hf_kernels_swiglu        10.31%     179.916us        99.57%       1.738ms       1.738ms       0.000us         0.00%       5.472us       5.472us             1  
-                      _activation_beeaae6::silu_and_mul         1.09%      18.951us        86.60%       1.512ms     503.911us       4.064us       100.00%       5.472us       1.824us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        83.12%       1.451ms        83.12%       1.451ms       1.451ms       1.408us        34.65%       1.408us       1.408us             1  
-                                            aten::empty         2.66%      46.432us         2.66%      46.432us      15.477us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.39%      41.801us         2.39%      41.801us      13.934us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.43%       7.500us         0.43%       7.500us       7.500us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      72.288us      1807.20%      72.288us      72.288us             1  
+                                      hf_kernels_swiglu        12.07%     211.387us        99.59%       1.744ms       1.744ms       0.000us         0.00%       5.376us       5.376us             1  
+                      _activation_beeaae6::silu_and_mul         1.10%      19.319us        84.87%       1.486ms     495.368us       4.000us       100.00%       5.376us       1.792us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.000us       100.00%       4.000us       1.333us             3  
+                                Activity Buffer Request        81.49%       1.427ms        81.49%       1.427ms       1.427ms       1.376us        34.40%       1.376us       1.376us             1  
+                                            aten::empty         2.64%      46.231us         2.64%      46.231us      15.410us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.28%      39.911us         2.28%      39.911us      13.304us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.41%       7.220us         0.41%       7.220us       7.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.746ms
-Self CUDA time total: 4.064us
+Self CPU time total: 1.751ms
+Self CUDA time total: 4.000us
 
 
 
@@ -3996,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      68.703us      1759.36%      68.703us      68.703us             1  
-                                      hf_kernels_swiglu         6.60%     109.215us        99.70%       1.650ms       1.650ms       0.000us         0.00%       5.217us       5.217us             1  
-                      _activation_beeaae6::silu_and_mul         1.44%      23.760us        91.91%       1.521ms     506.927us       3.905us       100.00%       5.217us       1.739us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.905us       100.00%       3.905us       1.302us             3  
-                                Activity Buffer Request        88.83%       1.470ms        88.83%       1.470ms       1.470ms       1.312us        33.60%       1.312us       1.312us             1  
-                                            aten::empty         1.19%      19.640us         1.19%      19.640us       6.547us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.65%      27.251us         1.65%      27.251us       9.084us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       4.941us         0.30%       4.941us       4.941us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.686us      1579.79%      62.686us      62.686us             1  
+                                      hf_kernels_swiglu         6.72%     108.943us        99.67%       1.616ms       1.616ms       0.000us         0.00%       5.312us       5.312us             1  
+                      _activation_beeaae6::silu_and_mul         1.34%      21.721us        91.77%       1.488ms     495.875us       3.968us       100.00%       5.312us       1.771us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
+                                Activity Buffer Request        88.82%       1.440ms        88.82%       1.440ms       1.440ms       1.344us        33.87%       1.344us       1.344us             1  
+                                            aten::empty         1.18%      19.150us         1.18%      19.150us       6.383us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.61%      26.150us         1.61%      26.150us       8.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.310us         0.33%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.655ms
-Self CUDA time total: 3.905us
+Self CPU time total: 1.621ms
+Self CUDA time total: 3.968us
 
 
 
@@ -4016,16 +4016,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.999us      1388.58%      67.999us      67.999us             1  
-                                      hf_kernels_swiglu         6.71%     113.524us        99.73%       1.687ms       1.687ms       0.000us         0.00%       6.529us       6.529us             1  
-                      _activation_beeaae6::silu_and_mul         1.26%      21.380us        91.91%       1.555ms     518.231us       4.897us       100.00%       6.529us       2.176us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.687us      1361.79%      66.687us      66.687us             1  
+                                      hf_kernels_swiglu         6.74%     109.943us        99.70%       1.626ms       1.626ms       0.000us         0.00%       6.529us       6.529us             1  
+                      _activation_beeaae6::silu_and_mul         1.25%      20.459us        91.78%       1.496ms     498.816us       4.897us       100.00%       6.529us       2.176us             3  
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.897us       100.00%       4.897us       1.632us             3  
-                                Activity Buffer Request        89.08%       1.507ms        89.08%       1.507ms       1.507ms       1.632us        33.33%       1.632us       1.632us             1  
-                                            aten::empty         1.11%      18.802us         1.11%      18.802us       6.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.56%      26.371us         1.56%      26.371us       8.790us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.571us         0.27%       4.571us       4.571us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        88.91%       1.450ms        88.91%       1.450ms       1.450ms       1.632us        33.33%       1.632us       1.632us             1  
+                                            aten::empty         1.18%      19.260us         1.18%      19.260us       6.420us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.61%      26.232us         1.61%      26.232us       8.744us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       4.870us         0.30%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.692ms
+Self CPU time total: 1.631ms
 Self CUDA time total: 4.897us
 
 
@@ -4036,16 +4036,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.776us      1498.50%      63.776us      63.776us             1  
-                                      hf_kernels_swiglu         5.54%      99.283us        99.75%       1.788ms       1.788ms       0.000us         0.00%       5.696us       5.696us             1  
-                      _activation_beeaae6::silu_and_mul         1.20%      21.550us        93.21%       1.671ms     556.862us       4.256us       100.00%       5.696us       1.899us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.081us      1552.66%      66.081us      66.081us             1  
+                                      hf_kernels_swiglu         6.15%     108.423us        99.71%       1.758ms       1.758ms       0.000us         0.00%       5.696us       5.696us             1  
+                      _activation_beeaae6::silu_and_mul         1.25%      22.001us        92.49%       1.631ms     543.697us       4.256us       100.00%       5.696us       1.899us             3  
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.256us       100.00%       4.256us       1.419us             3  
-                                Activity Buffer Request        79.15%       1.419ms        79.15%       1.419ms       1.419ms       1.440us        33.83%       1.440us       1.440us             1  
-                                            aten::empty         1.00%      17.972us         1.00%      17.972us       5.991us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        12.85%     230.398us        12.85%     230.398us      76.799us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       4.510us         0.25%       4.510us       4.510us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        80.93%       1.427ms        80.93%       1.427ms       1.427ms       1.440us        33.83%       1.440us       1.440us             1  
+                                            aten::empty         1.07%      18.910us         1.07%      18.910us       6.303us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.31%     181.874us        10.31%     181.874us      60.625us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.29%       5.110us         0.29%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.792ms
+Self CPU time total: 1.764ms
 Self CUDA time total: 4.256us
 
 
@@ -4056,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.431us      1060.31%      62.431us      62.431us             1  
-                                      hf_kernels_swiglu        20.17%      83.914us        98.89%     411.305us     411.305us       0.000us         0.00%       7.872us       7.872us             1  
-                      _activation_beeaae6::silu_and_mul         5.09%      21.171us        74.40%     309.470us     103.157us       5.888us       100.00%       7.872us       2.624us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us       100.00%       5.888us       1.963us             3  
-                                Activity Buffer Request        32.60%     135.614us        32.60%     135.614us     135.614us       1.984us        33.70%       1.984us       1.984us             1  
-                                            aten::empty         4.31%      17.921us         4.31%      17.921us       5.974us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        36.71%     152.685us        36.71%     152.685us      50.895us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.11%       4.631us         1.11%       4.631us       4.631us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.167us      1072.63%      63.167us      63.167us             1  
+                                      hf_kernels_swiglu        15.22%      87.332us        99.19%     569.294us     569.294us       0.000us         0.00%       7.873us       7.873us             1  
+                      _activation_beeaae6::silu_and_mul         3.58%      20.570us        80.67%     463.002us     154.334us       5.889us       100.00%       7.873us       2.624us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.889us       100.00%       5.889us       1.963us             3  
+                                Activity Buffer Request        48.76%     279.877us        48.76%     279.877us     279.877us       1.984us        33.69%       1.984us       1.984us             1  
+                                            aten::empty         3.30%      18.960us         3.30%      18.960us       6.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        28.32%     162.555us        28.32%     162.555us      54.185us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.81%       4.660us         0.81%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 415.936us
-Self CUDA time total: 5.888us
+Self CPU time total: 573.954us
+Self CUDA time total: 5.889us
 
 
 
@@ -4076,16 +4076,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.615us       880.40%      67.615us      67.615us             1  
-                                      hf_kernels_swiglu         5.97%     103.444us        99.74%       1.727ms       1.727ms       0.000us         0.00%      10.240us      10.240us             1  
-                      _activation_beeaae6::silu_and_mul         1.23%      21.310us        92.70%       1.605ms     535.135us       7.680us       100.00%      10.240us       3.413us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.632us       906.67%      69.632us      69.632us             1  
+                                      hf_kernels_swiglu         6.07%     107.484us        99.73%       1.766ms       1.766ms       0.000us         0.00%      10.240us      10.240us             1  
+                      _activation_beeaae6::silu_and_mul         1.19%      21.010us        92.55%       1.639ms     546.413us       7.680us       100.00%      10.240us       3.413us             3  
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.680us       100.00%       7.680us       2.560us             3  
-                                Activity Buffer Request        82.79%       1.434ms        82.79%       1.434ms       1.434ms       2.560us        33.33%       2.560us       2.560us             1  
-                                            aten::empty         1.07%      18.611us         1.07%      18.611us       6.204us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.68%     150.305us         8.68%     150.305us      50.102us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.450us         0.26%       4.450us       4.450us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        81.69%       1.447ms        81.69%       1.447ms       1.447ms       2.560us        33.33%       2.560us       2.560us             1  
+                                            aten::empty         1.11%      19.720us         1.11%      19.720us       6.573us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.67%     171.234us         9.67%     171.234us      57.078us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.800us         0.27%       4.800us       4.800us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.732ms
+Self CPU time total: 1.771ms
 Self CUDA time total: 7.680us
 
 
@@ -4096,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.423us       962.12%      63.423us      63.423us             1  
-                                      hf_kernels_swiglu         5.71%      97.705us        99.74%       1.706ms       1.706ms       0.000us         0.00%       8.800us       8.800us             1  
-                      _activation_beeaae6::silu_and_mul         1.25%      21.440us        92.96%       1.590ms     530.071us       6.592us       100.00%       8.800us       2.933us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us       100.00%       6.592us       2.197us             3  
-                                Activity Buffer Request        82.94%       1.419ms        82.94%       1.419ms       1.419ms       2.208us        33.50%       2.208us       2.208us             1  
-                                            aten::empty         1.07%      18.230us         1.07%      18.230us       6.077us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.77%     149.945us         8.77%     149.945us      49.982us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.450us         0.26%       4.450us       4.450us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      72.064us      1098.54%      72.064us      72.064us             1  
+                                      hf_kernels_swiglu         6.19%     109.521us        99.72%       1.763ms       1.763ms       0.000us         0.00%       8.768us       8.768us             1  
+                      _activation_beeaae6::silu_and_mul         1.22%      21.580us        92.43%       1.635ms     544.850us       6.560us       100.00%       8.768us       2.923us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us       100.00%       6.560us       2.187us             3  
+                                Activity Buffer Request        81.92%       1.449ms        81.92%       1.449ms       1.449ms       2.208us        33.66%       2.208us       2.208us             1  
+                                            aten::empty         1.09%      19.351us         1.09%      19.351us       6.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.29%     164.205us         9.29%     164.205us      54.735us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       4.990us         0.28%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.711ms
-Self CUDA time total: 6.592us
+Self CPU time total: 1.768ms
+Self CUDA time total: 6.560us
 
 
 
@@ -4116,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      61.982us       658.89%      61.982us      61.982us             1  
-                                      hf_kernels_swiglu        22.04%      82.603us        98.77%     370.213us     370.213us       0.000us         0.00%      12.543us      12.543us             1  
-                      _activation_beeaae6::silu_and_mul         5.90%      22.112us        71.72%     268.830us      89.610us       9.407us       100.00%      12.543us       4.181us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.407us       100.00%       9.407us       3.136us             3  
-                                Activity Buffer Request        26.16%      98.063us        26.16%      98.063us      98.063us       3.136us        33.34%       3.136us       3.136us             1  
-                                            aten::empty         5.01%      18.780us         5.01%      18.780us       6.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        39.66%     148.655us        39.66%     148.655us      49.552us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.23%       4.600us         1.23%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.118us       692.16%      65.118us      65.118us             1  
+                                      hf_kernels_swiglu        16.62%      89.683us        99.03%     534.374us     534.374us       0.000us         0.00%      12.576us      12.576us             1  
+                      _activation_beeaae6::silu_and_mul         3.96%      21.372us        78.99%     426.201us     142.067us       9.408us       100.00%      12.576us       4.192us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.408us       100.00%       9.408us       3.136us             3  
+                                Activity Buffer Request        44.61%     240.735us        44.61%     240.735us     240.735us       3.168us        33.67%       3.168us       3.168us             1  
+                                            aten::empty         3.43%      18.490us         3.43%      18.490us       6.163us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.41%     164.094us        30.41%     164.094us      54.698us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.97%       5.210us         0.97%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 374.813us
-Self CUDA time total: 9.407us
+Self CPU time total: 539.584us
+Self CUDA time total: 9.408us
 
 
 
@@ -4136,17 +4136,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.776us       490.85%      63.776us      63.776us             1  
-                                      hf_kernels_swiglu        24.11%      99.284us        98.97%     407.515us     407.515us       0.000us         0.00%      17.346us      17.346us             1  
-                      _activation_beeaae6::silu_and_mul         5.19%      21.351us        70.31%     289.510us      96.503us      12.993us       100.00%      17.346us       5.782us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      12.993us       100.00%      12.993us       4.331us             3  
-                                Activity Buffer Request        28.96%     119.264us        28.96%     119.264us     119.264us       4.353us        33.50%       4.353us       4.353us             1  
-                                            aten::empty         4.55%      18.721us         4.55%      18.721us       6.240us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        36.16%     148.895us        36.16%     148.895us      49.632us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.03%       4.240us         1.03%       4.240us       4.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.182us       527.34%      69.182us      69.182us             1  
+                                      hf_kernels_swiglu        12.86%     103.214us        99.41%     797.800us     797.800us       0.000us         0.00%      17.534us      17.534us             1  
+                      _activation_beeaae6::silu_and_mul         2.63%      21.139us        84.20%     675.726us     225.242us      13.119us       100.00%      17.534us       5.845us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.119us       100.00%      13.119us       4.373us             3  
+                                Activity Buffer Request        61.21%     491.232us        61.21%     491.232us     491.232us       4.415us        33.65%       4.415us       4.415us             1  
+                                            aten::empty         2.35%      18.860us         2.35%      18.860us       6.287us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        20.35%     163.355us        20.35%     163.355us      54.452us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.59%       4.750us         0.59%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 411.755us
-Self CUDA time total: 12.993us
+Self CPU time total: 802.550us
+Self CUDA time total: 13.119us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4163,13 +4163,12 @@ hf_kernels_swiglu        cuda_T512_D768         0.03  True
 
▶ UV Install Logs
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] -Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:00, 7.79it/s] -Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 11.48it/s] -Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 15.62it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.29it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 19.98it/s]

Artifacts:

activation.jsonl diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html index e0544d7b368c13222c83ebad4ecbb275fed41e18..6e53efa4229f749d46be9ca846a20dfeed1ecd5d 100644 --- a/activation/impls/torch_swiglu.html +++ b/activation/impls/torch_swiglu.html @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.26s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
-
Tue Oct 28 14:07:54 2025       
+
Wed Oct 29 14:26:44 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P0             80W /  350W |       0MiB /  46068MiB |      1%      Default |
+| N/A   32C    P0            133W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 6.88s
+Cell: benchmark | 6.86s
  | 
 
 Raw
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     206.526us      1621.34%     206.526us     206.526us             1  
-                                            torch_eager        11.16%     213.167us        99.55%       1.902ms       1.902ms       0.000us         0.00%      15.042us      15.042us             1  
-                                             aten::silu         3.29%      62.892us        81.79%       1.563ms     520.961us       6.529us        51.26%       8.833us       2.944us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.529us        51.26%       6.529us       2.176us             3  
-                                              aten::mul         2.06%      39.382us         3.23%      61.724us      20.575us       6.209us        48.74%       6.209us       2.070us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.209us        48.74%       6.209us       2.070us             3  
-                                Activity Buffer Request        76.05%       1.453ms        76.05%       1.453ms       1.453ms       2.304us        18.09%       2.304us       2.304us             1  
-                                            aten::slice         2.72%      51.931us         3.38%      64.581us      10.764us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.66%      12.650us         0.66%      12.650us       2.108us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.62%      69.144us         3.62%      69.144us      11.524us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.45%       8.521us         0.45%       8.521us       8.521us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     189.470us      1483.94%     189.470us     189.470us             1  
+                                            torch_eager        11.64%     220.727us        99.60%       1.889ms       1.889ms       0.000us         0.00%      15.103us      15.103us             1  
+                                             aten::silu         3.36%      63.732us        81.84%       1.552ms     517.326us       6.559us        51.37%       8.894us       2.965us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.559us        51.37%       6.559us       2.186us             3  
+                                              aten::mul         1.83%      34.608us         3.05%      57.780us      19.260us       6.209us        48.63%       6.209us       2.070us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.209us        48.63%       6.209us       2.070us             3  
+                                Activity Buffer Request        76.17%       1.444ms        76.17%       1.444ms       1.444ms       2.335us        18.29%       2.335us       2.335us             1  
+                                            aten::slice         2.47%      46.790us         3.07%      58.281us       9.714us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.61%      11.491us         0.61%      11.491us       1.915us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.54%      67.043us         3.54%      67.043us      11.174us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.40%       7.531us         0.40%       7.531us       7.531us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.911ms
-Self CUDA time total: 12.738us
+Self CPU time total: 1.896ms
+Self CUDA time total: 12.768us
 
 
 
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.965us      1227.21%     151.965us     151.965us             1  
-                                            torch_eager         7.02%     119.974us        99.63%       1.704ms       1.704ms       0.000us         0.00%      14.558us      14.558us             1  
-                                             aten::silu         2.35%      40.140us        88.12%       1.507ms     502.320us       6.399us        51.68%       8.574us       2.858us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.895us      1299.43%     160.895us     160.895us             1  
+                                            torch_eager         6.82%     117.243us        99.71%       1.713ms       1.713ms       0.000us         0.00%      14.558us      14.558us             1  
+                                             aten::silu         2.46%      42.340us        88.23%       1.516ms     505.362us       6.399us        51.68%       8.575us       2.858us             3  
 void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.399us        51.68%       6.399us       2.133us             3  
-                                              aten::mul         1.61%      27.481us         2.72%      46.541us      15.514us       5.984us        48.32%       5.984us       1.995us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        48.32%       5.984us       1.995us             3  
-                                Activity Buffer Request        84.14%       1.439ms        84.14%       1.439ms       1.439ms       2.175us        17.56%       2.175us       2.175us             1  
-                                            aten::slice         1.43%      24.471us         1.78%      30.412us       5.069us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.35%       5.941us         0.35%       5.941us       0.990us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.74%      46.851us         2.74%      46.851us       7.809us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.37%       6.320us         0.37%       6.320us       6.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                              aten::mul         1.64%      28.101us         2.83%      48.681us      16.227us       5.983us        48.32%       5.983us       1.994us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.983us        48.32%       5.983us       1.994us             3  
+                                Activity Buffer Request        84.10%       1.445ms        84.10%       1.445ms       1.445ms       2.176us        17.57%       2.176us       2.176us             1  
+                                            aten::slice         1.47%      25.252us         1.82%      31.222us       5.204us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.35%       5.970us         0.35%       5.970us       0.995us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.87%      49.290us         2.87%      49.290us       8.215us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.020us         0.29%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.710ms
-Self CUDA time total: 12.383us
+Self CPU time total: 1.718ms
+Self CUDA time total: 12.382us
 
 
 
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.008us      1139.77%     151.008us     151.008us             1  
-                                            torch_eager         6.34%     107.173us        99.70%       1.687ms       1.687ms       0.000us         0.00%      15.522us      15.522us             1  
-                                             aten::silu         2.38%      40.332us        88.83%       1.503ms     500.911us       6.817us        51.45%       9.090us       3.030us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.817us        51.45%       6.817us       2.272us             3  
-                                              aten::mul         1.57%      26.503us         2.73%      46.253us      15.418us       6.432us        48.55%       6.432us       2.144us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.432us        48.55%       6.432us       2.144us             3  
-                                Activity Buffer Request        84.91%       1.436ms        84.91%       1.436ms       1.436ms       2.273us        17.16%       2.273us       2.273us             1  
-                                            aten::slice         1.43%      24.250us         1.81%      30.550us       5.092us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.37%       6.300us         0.37%       6.300us       1.050us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.70%      45.731us         2.70%      45.731us       7.622us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.30%       5.000us         0.30%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.982us      1195.38%     157.982us     157.982us             1  
+                                            torch_eager         6.51%     110.244us        99.65%       1.686ms       1.686ms       0.000us         0.00%      15.488us      15.488us             1  
+                                             aten::silu         2.52%      42.653us        88.50%       1.498ms     499.192us       6.784us        51.33%       9.056us       3.019us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.33%       6.784us       2.261us             3  
+                                              aten::mul         1.66%      28.021us         2.76%      46.791us      15.597us       6.432us        48.67%       6.432us       2.144us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.432us        48.67%       6.432us       2.144us             3  
+                                Activity Buffer Request        84.30%       1.427ms        84.30%       1.427ms       1.427ms       2.272us        17.19%       2.272us       2.272us             1  
+                                            aten::slice         1.51%      25.627us         1.87%      31.700us       5.283us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.073us         0.36%       6.073us       1.012us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.78%      47.050us         2.78%      47.050us       7.842us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       5.950us         0.35%       5.950us       5.950us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
 Self CPU time total: 1.692ms
-Self CUDA time total: 13.249us
+Self CUDA time total: 13.216us
 
 
 
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     153.149us      1202.68%     153.149us     153.149us             1  
-                                            torch_eager         6.34%     109.104us        99.71%       1.717ms       1.717ms       0.000us         0.00%      14.941us      14.941us             1  
-                                             aten::silu         2.38%      40.982us        88.93%       1.531ms     510.411us       6.558us        51.50%       8.765us       2.922us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.558us        51.50%       6.558us       2.186us             3  
-                                              aten::mul         1.52%      26.241us         2.68%      46.222us      15.407us       6.176us        48.50%       6.176us       2.059us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        48.50%       6.176us       2.059us             3  
-                                Activity Buffer Request        73.41%       1.264ms        73.41%       1.264ms       1.264ms       2.207us        17.33%       2.207us       2.207us             1  
-                                            aten::slice         1.43%      24.560us         1.77%      30.400us       5.067us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.34%       5.840us         0.34%       5.840us       0.973us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        14.29%     246.139us        14.29%     246.139us      41.023us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.29%       4.920us         0.29%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     159.902us      1258.67%     159.902us     159.902us             1  
+                                            torch_eager         6.73%     114.317us        99.66%       1.694ms       1.694ms       0.000us         0.00%      14.912us      14.912us             1  
+                                             aten::silu         2.46%      41.881us        88.34%       1.501ms     500.465us       6.560us        51.64%       8.768us       2.923us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.64%       6.560us       2.187us             3  
+                                              aten::mul         1.68%      28.581us         2.79%      47.441us      15.814us       6.144us        48.36%       6.144us       2.048us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.144us        48.36%       6.144us       2.048us             3  
+                                Activity Buffer Request        74.33%       1.263ms        74.33%       1.263ms       1.263ms       2.208us        17.38%       2.208us       2.208us             1  
+                                            aten::slice         1.44%      24.468us         1.80%      30.638us       5.106us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.170us         0.36%       6.170us       1.028us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.65%     214.994us        12.65%     214.994us      35.832us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.34%       5.830us         0.34%       5.830us       5.830us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.722ms
-Self CUDA time total: 12.734us
+Self CPU time total: 1.700ms
+Self CUDA time total: 12.704us
 
 
 
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     149.310us      1126.87%     149.310us     149.310us             1  
-                                            torch_eager         5.88%     107.113us        99.73%       1.817ms       1.817ms       0.000us         0.00%      15.555us      15.555us             1  
-                                             aten::silu         2.34%      42.602us        89.83%       1.636ms     545.432us       6.785us        51.21%       9.090us       3.030us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us        51.21%       6.785us       2.262us             3  
-                                              aten::mul         1.33%      24.312us         2.33%      42.512us      14.171us       6.465us        48.79%       6.465us       2.155us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.465us        48.79%       6.465us       2.155us             3  
-                                Activity Buffer Request        78.20%       1.424ms        78.20%       1.424ms       1.424ms       2.305us        17.40%       2.305us       2.305us             1  
-                                            aten::slice         1.35%      24.650us         1.68%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.33%       6.010us         0.33%       6.010us       1.002us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.29%     187.406us        10.29%     187.406us      31.234us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       4.950us         0.27%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.053us      1185.48%     157.053us     157.053us             1  
+                                            torch_eager         6.08%     111.294us        99.69%       1.824ms       1.824ms       0.000us         0.00%      15.552us      15.552us             1  
+                                             aten::silu         2.39%      43.729us        89.42%       1.636ms     545.306us       6.784us        51.21%       9.088us       3.029us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.21%       6.784us       2.261us             3  
+                                              aten::mul         1.44%      26.361us         2.52%      46.181us      15.394us       6.464us        48.79%       6.464us       2.155us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.464us        48.79%       6.464us       2.155us             3  
+                                Activity Buffer Request        77.97%       1.426ms        77.97%       1.426ms       1.426ms       2.304us        17.39%       2.304us       2.304us             1  
+                                            aten::slice         1.34%      24.571us         1.66%      30.441us       5.074us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.32%       5.870us         0.32%       5.870us       0.978us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.14%     185.544us        10.14%     185.544us      30.924us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.601us         0.31%       5.601us       5.601us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.822ms
-Self CUDA time total: 13.250us
+Self CPU time total: 1.829ms
+Self CUDA time total: 13.248us
 
 
 
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     143.804us       924.73%     143.804us     143.804us             1  
-                                            torch_eager        21.50%     103.524us        99.01%     476.736us     476.736us       0.000us         0.00%      18.271us      18.271us             1  
-                                             aten::silu         8.70%      41.893us        62.70%     301.891us     100.630us       7.999us        51.44%      10.719us       3.573us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.999us        51.44%       7.999us       2.666us             3  
-                                              aten::mul         5.07%      24.390us         8.83%      42.521us      14.174us       7.552us        48.56%       7.552us       2.517us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.56%       7.552us       2.517us             3  
-                                Activity Buffer Request        22.22%     106.973us        22.22%     106.973us     106.973us       2.720us        17.49%       2.720us       2.720us             1  
-                                            aten::slice         4.80%      23.090us         5.98%      28.800us       4.800us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.19%       5.710us         1.19%       5.710us       0.952us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        35.55%     171.156us        35.55%     171.156us      28.526us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.99%       4.760us         0.99%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.390us       977.47%     151.390us     151.390us             1  
+                                            torch_eager        22.03%     109.975us        99.02%     494.363us     494.363us       0.000us         0.00%      18.176us      18.176us             1  
+                                             aten::silu         8.41%      41.971us        61.88%     308.937us     102.979us       7.936us        51.24%      10.624us       3.541us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.24%       7.936us       2.645us             3  
+                                              aten::mul         5.23%      26.101us         8.92%      44.531us      14.844us       7.552us        48.76%       7.552us       2.517us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.76%       7.552us       2.517us             3  
+                                Activity Buffer Request        22.19%     110.773us        22.19%     110.773us     110.773us       2.688us        17.36%       2.688us       2.688us             1  
+                                            aten::slice         5.05%      25.220us         6.19%      30.920us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.14%       5.700us         1.14%       5.700us       0.950us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        34.98%     174.623us        34.98%     174.623us      29.104us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.98%       4.900us         0.98%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 481.496us
-Self CUDA time total: 15.551us
+Self CPU time total: 499.263us
+Self CUDA time total: 15.488us
 
 
 
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     153.372us      1067.46%     153.372us     153.372us             1  
-                                            torch_eager         5.96%     108.164us        99.73%       1.810ms       1.810ms       0.000us         0.00%      16.832us      16.832us             1  
-                                             aten::silu         2.30%      41.731us        89.59%       1.626ms     541.925us       7.360us        51.22%       9.824us       3.275us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        51.22%       7.360us       2.453us             3  
-                                              aten::mul         1.41%      25.542us         2.47%      44.792us      14.931us       7.008us        48.78%       7.008us       2.336us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.008us        48.78%       7.008us       2.336us             3  
-                                Activity Buffer Request        78.82%       1.430ms        78.82%       1.430ms       1.430ms       2.464us        17.15%       2.464us       2.464us             1  
-                                            aten::slice         1.37%      24.840us         1.70%      30.900us       5.150us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.33%       6.060us         0.33%       6.060us       1.010us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.53%     172.976us         9.53%     172.976us      28.829us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       4.960us         0.27%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     163.583us      1143.70%     163.583us     163.583us             1  
+                                            torch_eager         6.28%     116.052us        99.70%       1.841ms       1.841ms       0.000us         0.00%      16.767us      16.767us             1  
+                                             aten::silu         2.27%      41.942us        89.09%       1.645ms     548.450us       7.327us        51.23%       9.791us       3.264us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.327us        51.23%       7.327us       2.442us             3  
+                                              aten::mul         1.55%      28.681us         2.62%      48.392us      16.131us       6.976us        48.77%       6.976us       2.325us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.976us        48.77%       6.976us       2.325us             3  
+                                Activity Buffer Request        78.22%       1.445ms        78.22%       1.445ms       1.445ms       2.464us        17.23%       2.464us       2.464us             1  
+                                            aten::slice         1.38%      25.430us         1.70%      31.392us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.32%       5.962us         0.32%       5.962us       0.994us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.67%     178.614us         9.67%     178.614us      29.769us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.30%       5.570us         0.30%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.815ms
-Self CUDA time total: 14.368us
+Self CPU time total: 1.847ms
+Self CUDA time total: 14.303us
 
 
 
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     146.240us       942.27%     146.240us     146.240us             1  
-                                            torch_eager        22.59%     104.486us        98.96%     457.726us     457.726us       0.000us         0.00%      18.208us      18.208us             1  
-                                             aten::silu         8.78%      40.590us        60.43%     279.519us      93.173us       7.936us        51.13%      10.624us       3.541us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.13%       7.936us       2.645us             3  
-                                              aten::mul         5.53%      25.579us         9.45%      43.730us      14.577us       7.584us        48.87%       7.584us       2.528us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.584us        48.87%       7.584us       2.528us             3  
-                                Activity Buffer Request        18.85%      87.193us        18.85%      87.193us      87.193us       2.688us        17.32%       2.688us       2.688us             1  
-                                            aten::slice         5.23%      24.201us         6.48%      29.991us       4.999us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.25%       5.790us         1.25%       5.790us       0.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        36.73%     169.887us        36.73%     169.887us      28.314us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         1.04%       4.800us         1.04%       4.800us       4.800us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     150.172us       969.60%     150.172us     150.172us             1  
+                                            torch_eager        23.07%     110.204us        98.98%     472.752us     472.752us       0.000us         0.00%      18.176us      18.176us             1  
+                                             aten::silu         9.08%      43.371us        60.20%     287.547us      95.849us       7.936us        51.24%      10.624us       3.541us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.24%       7.936us       2.645us             3  
+                                              aten::mul         5.48%      26.181us         9.38%      44.801us      14.934us       7.552us        48.76%       7.552us       2.517us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.76%       7.552us       2.517us             3  
+                                Activity Buffer Request        19.26%      92.002us        19.26%      92.002us      92.002us       2.688us        17.36%       2.688us       2.688us             1  
+                                            aten::slice         5.00%      23.870us         6.32%      30.200us       5.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.33%       6.330us         1.33%       6.330us       1.055us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        35.76%     170.794us        35.76%     170.794us      28.466us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.02%       4.871us         1.02%       4.871us       4.871us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 462.526us
-Self CUDA time total: 15.520us
+Self CPU time total: 477.623us
+Self CUDA time total: 15.488us
 
 
 
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     181.470us       803.28%     181.470us     181.470us             1  
-                                            torch_eager         5.97%     109.125us        99.74%       1.823ms       1.823ms       0.000us         0.00%      26.526us      26.526us             1  
-                                             aten::silu         2.38%      43.492us        88.50%       1.617ms     539.072us      11.647us        51.56%      15.582us       5.194us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.647us        51.56%      11.647us       3.882us             3  
-                                              aten::mul         1.42%      25.882us         3.51%      64.123us      21.374us      10.944us        48.44%      10.944us       3.648us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.944us        48.44%      10.944us       3.648us             3  
-                                Activity Buffer Request        77.67%       1.419ms        77.67%       1.419ms       1.419ms       3.935us        17.42%       3.935us       3.935us             1  
-                                            aten::slice         1.42%      25.910us         1.76%      32.089us       5.348us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.34%       6.179us         0.34%       6.179us       1.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.54%     192.606us        10.54%     192.606us      32.101us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.790us         0.26%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.000us       713.30%     160.000us     160.000us             1  
+                                            torch_eager         5.99%     109.975us        99.73%       1.831ms       1.831ms       0.000us         0.00%      26.335us      26.335us             1  
+                                             aten::silu         2.30%      42.230us        89.52%       1.643ms     547.763us      11.583us        51.64%      15.487us       5.162us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.583us        51.64%      11.583us       3.861us             3  
+                                              aten::mul         1.54%      28.250us         2.52%      46.180us      15.393us      10.848us        48.36%      10.848us       3.616us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.848us        48.36%      10.848us       3.616us             3  
+                                Activity Buffer Request        78.83%       1.447ms        78.83%       1.447ms       1.447ms       3.904us        17.40%       3.904us       3.904us             1  
+                                            aten::slice         1.37%      25.211us         1.70%      31.261us       5.210us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.33%       6.050us         0.33%       6.050us       1.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.37%     171.964us         9.37%     171.964us      28.661us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       4.930us         0.27%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.827ms
-Self CUDA time total: 22.591us
+Self CPU time total: 1.836ms
+Self CUDA time total: 22.431us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4184,7 +4184,7 @@ torch_eager              cuda_T512_D768         0.05  True
 
▶ UV Install Logs
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg index 2eb6d36da2a386c6f3b7ffe7a4f2ecf07fbe531d..b809b51f58837145ae3fdbcb04aa1aec4a5e023e 100644 --- a/activation/results/artifacts/combine/latency.svg +++ b/activation/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:431dea6a591fc822f7d0d0d6f793e8c11170edb647c627b5a44ad9883df2c3fc -size 20697 +oid sha256:f62c7d85fc4a76cf7a1060a62df99ff0d32133ab94bb502b68dcd53171c39602 +size 21424 diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html index f11a4ea4cf1c2f2bfbc419d5616f99db4990e15c..35064093e9085dbed21e2edd8a0a4e6c497bbb9d 100644 --- a/activation/results/combined_results.html +++ b/activation/results/combined_results.html @@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-28T14:09:13.211569 + 2025-10-29T14:27:49.999657 image/svg+xml @@ -4021,83 +4021,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 0.025 + 0.025 - + - + - 0.030 + 0.030 - + - + - 0.035 + 0.035 - + - + - 0.040 + 0.040 - + - + - 0.045 + 0.045 - + - + - 0.050 + 0.050 + + + + + + + + + + + + + 0.055 @@ -4105,37 +4118,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - - - - - - - + + + + + + + + - + - + - - - - - - - + + + + + + + @@ -4150,30 +4163,30 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + Attention Implementation Latency - + - - + + - + - hf_kernels_swiglu + hf_kernels_swiglu - - + + - + - torch_eager + torch_eager @@ -4193,7 +4206,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.28s +Cell: combine | 4.24s | Raw @@ -4319,7 +4332,7 @@ Implementations included:
▶ UV Install Logs
@@ -4332,7 +4345,7 @@ Installed 37 packages in 195ms - 2025-10-28T14:09:13.211569 + 2025-10-29T14:27:49.999657 image/svg+xml @@ -4481,83 +4494,96 @@ Installed 37 packages in 195ms - + - + - 0.025 + 0.025 - + - + - 0.030 + 0.030 - + - + - 0.035 + 0.035 - + - + - 0.040 + 0.040 - + - + - 0.045 + 0.045 - + - + - 0.050 + 0.050 + + + + + + + + + + + + + 0.055 @@ -4565,37 +4591,37 @@ Installed 37 packages in 195ms - + - - - - - - - - + + + + + + + + - + - + - - - - - - - + + + + + + + @@ -4610,30 +4636,30 @@ Installed 37 packages in 195ms - + Attention Implementation Latency - + - - + + - + - hf_kernels_swiglu + hf_kernels_swiglu - - + + - + - torch_eager + torch_eager diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl index 062646d5a3f22298019a79ab8e52f52ea42bd834..3c3e9cb1937f70bc8a6005f64424ae1ae23f373f 100644 --- a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl @@ -1,24 +1,24 @@ -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06712200001857127, "p50": 0.06883200001084333, "p90": 0.06976199995278876, "mean": 0.06901199997173535, "iqr": 0.0014600000213249587, "raw_times": [0.06976199995278876, 0.07104199994500959, 0.06712200001857127, 0.0683019999314638, 0.06883200001084333], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0738530000035098, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08455299996512622, "p50": 0.08599400007369695, "p90": 0.0868530000843748, "mean": 0.08612520005044644, "iqr": 0.0014299999975264654, "raw_times": [0.08780300004218589, 0.08455299996512622, 0.0868530000843748, 0.08542300008684833, 0.08599400007369695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941300006881647, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08494299993344612, "p50": 0.08714299997336639, "p90": 0.08724299993900786, "mean": 0.086546999955317, "iqr": 0.0020200000108161476, "raw_times": [0.08522299992819171, 0.08714299997336639, 0.08818300000257295, 0.08724299993900786, 0.08494299993344612], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105300000555872, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08327299997290538, "p50": 0.084122999965075, "p90": 0.08580299993354856, "mean": 0.08452299998680246, "iqr": 0.0023699999474047218, "raw_times": [0.08327299997290538, 0.084122999965075, 0.08598300007633952, 0.08580299993354856, 0.08343299998614384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08891300001323543, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08298299997022696, "p50": 0.08508299993081891, "p90": 0.08600299997851835, "mean": 0.0849267999683434, "iqr": 0.0016210000239880173, "raw_times": [0.08298299997022696, 0.08508299993081891, 0.08600299997851835, 0.08438199995453033, 0.08618300000762247], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08780300004218589, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08270299997548136, "p50": 0.08315299999139825, "p90": 0.0846430000365217, "mean": 0.08407499999520951, "iqr": 0.0019010000187336118, "raw_times": [0.08315299999139825, 0.08713399995485815, 0.08270299997548136, 0.08274200001778809, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981299993138236, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08372299998882227, "p50": 0.08510199995725998, "p90": 0.08608299992829416, "mean": 0.08701479998762807, "iqr": 0.0011499998890940333, "raw_times": [0.08493300003920012, 0.09523300002456381, 0.08510199995725998, 0.08372299998882227, 0.08608299992829416], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923300003971235, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08285199999136239, "p50": 0.08483300007355865, "p90": 0.08511300006830425, "mean": 0.08449480001218035, "iqr": 0.0016500000583619112, "raw_times": [0.08285199999136239, 0.08346300000994233, 0.08483300007355865, 0.08621299991773412, 0.08511300006830425], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08870299996033282, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08278300003894401, "p50": 0.08427300008406746, "p90": 0.08444299999155191, "mean": 0.08422300002166594, "iqr": 0.0002599999788799323, "raw_times": [0.08444299999155191, 0.08418300001267198, 0.08278300003894401, 0.08543299998109433, 0.08427300008406746], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08903299999474257, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08352199995442788, "p50": 0.0842329999386493, "p90": 0.08553300006042264, "mean": 0.08496079999531503, "iqr": 0.0014400000054592965, "raw_times": [0.08409300005496334, 0.08742299996811198, 0.08553300006042264, 0.08352199995442788, 0.0842329999386493], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985400006622513, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14414499992199126, "p50": 0.14512600000671227, "p90": 0.14515400005166157, "mean": 0.1465472000063528, "iqr": 0.0008580000212532468, "raw_times": [0.14512600000671227, 0.14414499992199126, 0.14429600003040832, 0.15401500002099056, 0.14515400005166157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14571500003057736, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16020600003230356, "p50": 0.16135600003508443, "p90": 0.16139600006681576, "mean": 0.16140360005465482, "iqr": 0.00029099999210302485, "raw_times": [0.16139600006681576, 0.1629550000643576, 0.16110500007471273, 0.16020600003230356, 0.16135600003508443], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1623660000404925, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07979300005445111, "p50": 0.08039299996198679, "p90": 0.08136300004935038, "mean": 0.08070500002759218, "iqr": 0.001150000002780871, "raw_times": [0.0802130000465695, 0.0817630000256031, 0.07979300005445111, 0.08039299996198679, 0.08136300004935038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0855329999467358, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0800829999434427, "p50": 0.08147299990923784, "p90": 0.08197300007850572, "mean": 0.08146099996793055, "iqr": 0.00109000018255756, "raw_times": [0.0800829999434427, 0.08197300007850572, 0.08147299990923784, 0.08289300001251831, 0.08088299989594816], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08291199992527254, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0799729999698684, "p50": 0.08137199995417177, "p90": 0.081513000054656, "mean": 0.08127659998535819, "iqr": 0.0006500000608866685, "raw_times": [0.0799729999698684, 0.08266199995432544, 0.081513000054656, 0.08086299999376934, 0.08137199995417177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08939400004237541, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08067300007041922, "p50": 0.08162300002823031, "p90": 0.08189199991193163, "mean": 0.08365860001049441, "iqr": 0.0008099999604382901, "raw_times": [0.08067300007041922, 0.08108199995149334, 0.08189199991193163, 0.08162300002823031, 0.09302300009039755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08415299998887349, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0806030000148894, "p50": 0.08186299999124458, "p90": 0.08199299998068454, "mean": 0.08162900001025264, "iqr": 0.001009999891721236, "raw_times": [0.08270299997548136, 0.08186299999124458, 0.0806030000148894, 0.08199299998068454, 0.08098300008896331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10199300004387624, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08040199998049502, "p50": 0.08168299996214046, "p90": 0.08185199999388715, "mean": 0.08171659999334224, "iqr": 0.0013889999763705418, "raw_times": [0.0804630000175166, 0.08418300001267198, 0.08168299996214046, 0.08040199998049502, 0.08185199999388715], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08522300004187855, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08097300008103048, "p50": 0.08150300004672317, "p90": 0.08173299988811777, "mean": 0.08153900000706926, "iqr": 0.0005599998758043512, "raw_times": [0.08117300001231342, 0.08231300000716146, 0.08150300004672317, 0.08173299988811777, 0.08097300008103048], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08440300007350743, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0802130000465695, "p50": 0.08124300006784324, "p90": 0.08242299998073577, "mean": 0.08162480000919459, "iqr": 0.0012000000424450263, "raw_times": [0.0802130000465695, 0.08302200001253368, 0.08242299998073577, 0.08124300006784324, 0.08122299993829074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08460300000479037, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09226300005593657, "p50": 0.09320300000581483, "p90": 0.0934630000983816, "mean": 0.09316100004070904, "iqr": 0.0007800000503266347, "raw_times": [0.09419299999535724, 0.09320300000581483, 0.0934630000983816, 0.09226300005593657, 0.09268300004805496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0951240000404141, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09887299995625654, "p50": 0.09917300008055463, "p90": 0.09990300009121711, "mean": 0.09939520000443736, "iqr": 0.0009100001534534385, "raw_times": [0.09887299995625654, 0.09917300008055463, 0.09990300009121711, 0.10003399995639484, 0.09899299993776367], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1023739999936879, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4842959999677987, "p50": 0.4860569999891595, "p90": 0.4878769999550059, "mean": 0.48646659997757524, "iqr": 0.002959999960694404, "raw_times": [0.4849169999943115, 0.4860569999891595, 0.4878769999550059, 0.4842959999677987, 0.4891859999816006], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4877669999814316, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4968179999877975, "p50": 0.49805800006197387, "p90": 0.4990780000753148, "mean": 0.4983496000022569, "iqr": 0.001141000097959477, "raw_times": [0.4979369999773553, 0.49985699990884314, 0.4990780000753148, 0.49805800006197387, 0.4968179999877975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49727700002222264, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:35Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047051000024112, "p50": 0.048391000007086404, "p90": 0.048571999968771706, "mean": 0.04890720000503279, "iqr": 0.0005509999709829572, "raw_times": [0.0525010000274051, 0.048571999968771706, 0.04802099999778875, 0.048391000007086404, 0.047051000024112], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058991999992485944, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05480199996554802, "p50": 0.05610099998420992, "p90": 0.05628099995647062, "mean": 0.056069199968078465, "iqr": 0.0006299999881775875, "raw_times": [0.057510999965870724, 0.05628099995647062, 0.05610099998420992, 0.055650999968293036, 0.05480199996554802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06025200002568454, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05469199999197372, "p50": 0.056971999981669796, "p90": 0.057361000017408514, "mean": 0.056363600003805914, "iqr": 0.0025490000439276628, "raw_times": [0.05469199999197372, 0.057361000017408514, 0.056971999981669796, 0.05798100005449669, 0.05481199997348085], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0596809999819925, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05416100003685642, "p50": 0.05502099998011545, "p90": 0.05511200004093553, "mean": 0.05489540000098714, "iqr": 0.00016000007008187822, "raw_times": [0.05416100003685642, 0.05495199997085365, 0.055230999976174644, 0.05511200004093553, 0.05502099998011545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05659100003185813, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052401000004920206, "p50": 0.055000999964249786, "p90": 0.056301000029179704, "mean": 0.054810999995424936, "iqr": 0.0023400000372930663, "raw_times": [0.052401000004920206, 0.056301000029179704, 0.056390999986888346, 0.05396099999188664, 0.055000999964249786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05838100003074942, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05270100001553146, "p50": 0.05342100001826111, "p90": 0.054111999986616865, "mean": 0.053611199996339565, "iqr": 0.0008709999974598759, "raw_times": [0.05324099998915699, 0.054580999972131394, 0.054111999986616865, 0.05342100001826111, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0580610000042725, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051810999991630524, "p50": 0.05310099999178419, "p90": 0.053301000036753976, "mean": 0.05577720000928821, "iqr": 0.0007700000423938036, "raw_times": [0.051810999991630524, 0.05253099999436017, 0.06814200003191218, 0.053301000036753976, 0.05310099999178419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056801999960498506, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053270999956112064, "p50": 0.05397199998924407, "p90": 0.05399100001568513, "mean": 0.054061200000887766, "iqr": 0.00023000001192485797, "raw_times": [0.05399100001568513, 0.05531100003963729, 0.053270999956112064, 0.05397199998924407, 0.05376100000376027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056641000014678866, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052231000040592335, "p50": 0.052561000018158666, "p90": 0.0526110000009794, "mean": 0.0529970000116009, "iqr": 0.0002200000039920269, "raw_times": [0.052231000040592335, 0.052390999996987375, 0.05519100000128674, 0.052561000018158666, 0.0526110000009794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05621100001462764, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052561000018158666, "p50": 0.053772000001117704, "p90": 0.05471100001841478, "mean": 0.057879400003457704, "iqr": 0.0021000000174353772, "raw_times": [0.0526110000009794, 0.05471100001841478, 0.053772000001117704, 0.052561000018158666, 0.07574199997861797], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06976200000963217, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05308099997591853, "p90": 0.05349200000637211, "mean": 0.05272739998645193, "iqr": 0.0014310000437944836, "raw_times": [0.052060999962577625, 0.05366199997069998, 0.05308099997591853, 0.051341000016691396, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05661099999088037, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05081099999415528, "p50": 0.053202000003693684, "p90": 0.05362099994954406, "mean": 0.05282339998302632, "iqr": 0.0011499999459374521, "raw_times": [0.05081099999415528, 0.052471000003606605, 0.05362099994954406, 0.054011999964131974, 0.053202000003693684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05642200000011144, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052221000032659504, "p50": 0.05397099999981947, "p90": 0.05448100000648992, "mean": 0.05380100001275423, "iqr": 0.001479999980347202, "raw_times": [0.052221000032659504, 0.05300100002614272, 0.055330999998659536, 0.05448100000648992, 0.05397099999981947], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421999995061924, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164199995988383, "p50": 0.053460999993149017, "p90": 0.05421100001967716, "mean": 0.0532791999944493, "iqr": 0.00139000002263856, "raw_times": [0.0528209999970386, 0.054261000002497894, 0.05421100001967716, 0.05164199995988383, 0.053460999993149017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05741199998965385, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052460999995673774, "p50": 0.0557109999590466, "p90": 0.05585100001326282, "mean": 0.05600519999688913, "iqr": 0.002330000029360235, "raw_times": [0.052460999995673774, 0.0557109999590466, 0.05585100001326282, 0.053520999983902584, 0.06248200003255988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421000005637325, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0540510000064387, "p50": 0.0541219999945497, "p90": 0.05425200004083308, "mean": 0.054337400013082515, "iqr": 0.0001910000264615519, "raw_times": [0.0540510000064387, 0.05425200004083308, 0.05520100000921957, 0.0541219999945497, 0.05406100001437153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05772200000819794, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05302099998516496, "p50": 0.05408099997339377, "p90": 0.0544409999747586, "mean": 0.05571119997966889, "iqr": 0.0008299999763039523, "raw_times": [0.05302099998516496, 0.0544409999747586, 0.06340199996657248, 0.053610999998454645, 0.05408099997339377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05787100002407897, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052391999986411975, "p50": 0.05292200000894809, "p90": 0.053131000015582686, "mean": 0.05318180001268047, "iqr": 0.00026899999738816405, "raw_times": [0.053131000015582686, 0.054602000034265075, 0.052391999986411975, 0.05286200001819452, 0.05292200000894809], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05755099999760205, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05195099998900332, "p50": 0.05479100002503401, "p90": 0.05620100000669481, "mean": 0.05852919999824735, "iqr": 0.0034000000255218765, "raw_times": [0.07690199998933167, 0.05620100000669481, 0.05479100002503401, 0.05195099998900332, 0.052800999981172936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057030999982998765, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05290100000365783, "p50": 0.05457100002104198, "p90": 0.055251000048883725, "mean": 0.055353400011881604, "iqr": 0.001779000058377278, "raw_times": [0.05290100000365783, 0.06057199999531804, 0.055251000048883725, 0.05347199999050645, 0.05457100002104198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056300999972336285, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052152000023397704, "p50": 0.05461199998535449, "p90": 0.05471200000783938, "mean": 0.05381760001910152, "iqr": 0.001900999961890193, "raw_times": [0.052152000023397704, 0.05461199998535449, 0.05480100003296684, 0.05471200000783938, 0.052811000045949186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758200001082514, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05339099999446262, "p50": 0.05463100001179555, "p90": 0.055171999974845676, "mean": 0.05451339999353877, "iqr": 0.0011309999763398082, "raw_times": [0.05404099999850587, 0.055331999988084135, 0.05339099999446262, 0.05463100001179555, 0.055171999974845676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058501000012256554, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05277100001421786, "p50": 0.053860999969401746, "p90": 0.054361000024982786, "mean": 0.053951200004576094, "iqr": 0.0007190000133050489, "raw_times": [0.05277100001421786, 0.05512100000260034, 0.05364200001167774, 0.053860999969401746, 0.054361000024982786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057851999997637904, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124100005104992, "p50": 0.053691000005073874, "p90": 0.054261999991922494, "mean": 0.05327740001348502, "iqr": 0.0014609999539061391, "raw_times": [0.05124100005104992, 0.05439199998136246, 0.054261999991922494, 0.053691000005073874, 0.052801000038016355], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05714199994599767, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py index 2e38669a505cbdf181a93e97f31ed1e67ecf4883..725b12c4018e4eec05c5ddccb0c88a8eae6f150d 100644 --- a/causal_conv1d/impls/cells/benchmark.py +++ b/causal_conv1d/impls/cells/benchmark.py @@ -4,37 +4,28 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", +# "kernels", # ] # # [tool.uv.sources] # kernels-benchmark-tools = { path = "../../../../../tools", editable = true } # /// import torch -import torch.nn.functional as F import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel +# Load the causal conv1d kernel +causal_conv1d = get_kernel("kernels-community/causal-conv1d") -def torch_causal_conv1d(input_tensor, weight, bias): - # Convert to weight dtype for computation - x = input_tensor.to(weight.dtype) - dim = weight.shape[0] - width = weight.shape[1] - seqlen = input_tensor.shape[-1] - # Depthwise causal conv1d using PyTorch - out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim) - - # Truncate to original sequence length - out = out[..., :seqlen] - - # Convert back to original dtype - return out.to(input_tensor.dtype) +def hf_kernels_causal_conv1d(input_tensor, weight, bias): + return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias) run_benchmark( kernel_type=KernelTypeEnum.CAUSAL_CONV1D, - impl_name="torch_eager", - impl_tags={"family": "pytorch", "backend": "eager"}, - impl_func=torch_causal_conv1d, + impl_name="hf_kernels_causal_conv1d", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_causal_conv1d, ) \ No newline at end of file diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html index e50cedeff51b83afce46864a23939e763973b082..025d1f7d39597f6702f2ef95b801eca2a6d706e8 100644 --- a/causal_conv1d/impls/hf_kernels_causal_conv1d.html +++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.24s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
-
Tue Oct 28 14:08:09 2025       
+
Wed Oct 29 14:27:09 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   28C    P0             80W /  350W |       0MiB /  46068MiB |     19%      Default |
+| N/A   33C    P0            109W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 9.91s
+Cell: benchmark | 5.79s
  | 
 
 Raw
@@ -3973,19 +3973,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     153.312us      3772.44%     153.312us     153.312us             1  
-                               hf_kernels_causal_conv1d         8.26%     153.696us        99.59%       1.854ms       1.854ms       0.000us         0.00%       5.504us       5.504us             1  
-                                         CausalConv1dFn         6.06%     112.844us        91.33%       1.700ms     566.616us       0.000us         0.00%       5.504us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      26.281us        81.37%       1.514ms     504.821us       4.064us       100.00%       5.504us       1.835us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        77.27%       1.438ms        77.27%       1.438ms       1.438ms       1.440us        35.43%       1.440us       1.440us             1  
-                                       aten::empty_like         1.15%      21.339us         3.90%      72.543us      24.181us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.75%      51.204us         2.75%      51.204us      17.068us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.69%      50.001us         2.69%      50.001us      16.667us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.41%       7.700us         0.41%       7.700us       7.700us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     151.393us      3724.31%     151.393us     151.393us             1  
+                               hf_kernels_causal_conv1d         8.95%     166.324us        99.62%       1.852ms       1.852ms       0.000us         0.00%       5.505us       5.505us             1  
+                                         CausalConv1dFn         6.05%     112.563us        90.67%       1.686ms     561.934us       0.000us         0.00%       5.505us       1.835us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      26.172us        80.97%       1.505ms     501.826us       4.065us       100.00%       5.505us       1.835us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
+                                Activity Buffer Request        77.14%       1.434ms        77.14%       1.434ms       1.434ms       1.440us        35.42%       1.440us       1.440us             1  
+                                       aten::empty_like         1.03%      19.059us         3.64%      67.761us      22.587us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.62%      48.702us         2.62%      48.702us      16.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.42%      45.061us         2.42%      45.061us      15.020us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.38%       7.150us         0.38%       7.150us       7.150us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.861ms
-Self CUDA time total: 4.064us
+Self CPU time total: 1.859ms
+Self CUDA time total: 4.065us
 
 
 
@@ -3995,19 +3995,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.895us      3412.63%     128.895us     128.895us             1  
-                               hf_kernels_causal_conv1d         5.00%      84.832us        99.68%       1.692ms       1.692ms       0.000us         0.00%       5.026us       5.026us             1  
-                                         CausalConv1dFn         4.43%      75.123us        94.68%       1.607ms     535.685us       0.000us         0.00%       5.026us       1.675us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.59%      27.059us        88.41%       1.501ms     500.224us       3.777us       100.00%       5.026us       1.675us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
-                                Activity Buffer Request        84.88%       1.441ms        84.88%       1.441ms       1.441ms       1.249us        33.07%       1.249us       1.249us             1  
-                                       aten::empty_like         0.54%       9.230us         1.84%      31.262us      10.421us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.30%      22.032us         1.30%      22.032us       7.344us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.94%      32.892us         1.94%      32.892us      10.964us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.32%       5.440us         0.32%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.439us      3456.32%     129.439us     129.439us             1  
+                               hf_kernels_causal_conv1d         5.79%      99.043us        99.68%       1.706ms       1.706ms       0.000us         0.00%       4.994us       4.994us             1  
+                                         CausalConv1dFn         4.71%      80.562us        93.90%       1.607ms     535.793us       0.000us         0.00%       4.994us       1.665us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      25.130us        87.50%       1.498ms     499.285us       3.745us       100.00%       4.994us       1.665us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.745us       100.00%       3.745us       1.248us             3  
+                                Activity Buffer Request        84.17%       1.441ms        84.17%       1.441ms       1.441ms       1.249us        33.35%       1.249us       1.249us             1  
+                                       aten::empty_like         0.47%       7.980us         1.69%      28.961us       9.654us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.23%      20.981us         1.23%      20.981us       6.994us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.86%      31.821us         1.86%      31.821us      10.607us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.32%       5.430us         0.32%       5.430us       5.430us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.697ms
-Self CUDA time total: 3.777us
+Self CPU time total: 1.712ms
+Self CUDA time total: 3.745us
 
 
 
@@ -4017,19 +4017,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.670us      3273.90%     124.670us     124.670us             1  
-                               hf_kernels_causal_conv1d         4.86%      81.824us        99.65%       1.679ms       1.679ms       0.000us         0.00%       5.056us       5.056us             1  
-                                         CausalConv1dFn         4.28%      72.081us        94.80%       1.598ms     532.512us       0.000us         0.00%       5.056us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.53%      25.732us        88.63%       1.494ms     497.871us       3.808us       100.00%       5.056us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.808us       100.00%       3.808us       1.269us             3  
-                                Activity Buffer Request        85.15%       1.435ms        85.15%       1.435ms       1.435ms       1.248us        32.77%       1.248us       1.248us             1  
-                                       aten::empty_like         0.59%       9.910us         1.89%      31.841us      10.614us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.30%      21.931us         1.30%      21.931us       7.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.96%      32.960us         1.96%      32.960us      10.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.35%       5.830us         0.35%       5.830us       5.830us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.098us      3285.62%     124.098us     124.098us             1  
+                               hf_kernels_causal_conv1d         5.52%      95.683us        99.69%       1.728ms       1.728ms       0.000us         0.00%       5.057us       5.057us             1  
+                                         CausalConv1dFn         4.48%      77.582us        94.17%       1.632ms     544.020us       0.000us         0.00%       5.057us       1.686us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      24.830us        87.99%       1.525ms     508.322us       3.777us       100.00%       5.057us       1.686us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
+                                Activity Buffer Request        84.76%       1.469ms        84.76%       1.469ms       1.469ms       1.280us        33.89%       1.280us       1.280us             1  
+                                       aten::empty_like         0.46%       7.920us         1.70%      29.511us       9.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.25%      21.591us         1.25%      21.591us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.80%      31.261us         1.80%      31.261us      10.420us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.301us         0.31%       5.301us       5.301us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.685ms
-Self CUDA time total: 3.808us
+Self CPU time total: 1.733ms
+Self CUDA time total: 3.777us
 
 
 
@@ -4039,19 +4039,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.358us      3479.68%     131.358us     131.358us             1  
-                               hf_kernels_causal_conv1d         4.44%      83.422us        99.71%       1.875ms       1.875ms       0.000us         0.00%       5.054us       5.054us             1  
-                                         CausalConv1dFn         4.02%      75.643us        95.28%       1.792ms     597.348us       0.000us         0.00%       5.054us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      25.501us        89.54%       1.684ms     561.363us       3.775us       100.00%       5.054us       1.685us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.775us       100.00%       3.775us       1.258us             3  
-                                Activity Buffer Request        75.66%       1.423ms        75.66%       1.423ms       1.423ms       1.279us        33.88%       1.279us       1.279us             1  
-                                       aten::empty_like         0.55%      10.279us         1.72%      32.311us      10.770us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.17%      22.032us         1.17%      22.032us       7.344us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        12.52%     235.449us        12.52%     235.449us      78.483us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.400us         0.29%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.729us      3378.36%     129.729us     129.729us             1  
+                               hf_kernels_causal_conv1d         5.03%      97.232us        99.72%       1.927ms       1.927ms       0.000us         0.00%       5.120us       5.120us             1  
+                                         CausalConv1dFn         4.11%      79.452us        94.69%       1.830ms     610.049us       0.000us         0.00%       5.120us       1.707us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.27%      24.481us        89.03%       1.721ms     573.588us       3.840us       100.00%       5.120us       1.707us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
+                                Activity Buffer Request        76.40%       1.477ms        76.40%       1.477ms       1.477ms       1.280us        33.33%       1.280us       1.280us             1  
+                                       aten::empty_like         0.41%       7.951us         1.55%      29.931us       9.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.14%      21.980us         1.14%      21.980us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        11.36%     219.575us        11.36%     219.575us      73.192us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.490us         0.28%       5.490us       5.490us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.881ms
-Self CUDA time total: 3.775us
+Self CPU time total: 1.933ms
+Self CUDA time total: 3.840us
 
 
 
@@ -4061,19 +4061,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.694us      2701.96%     129.694us     129.694us             1  
-                               hf_kernels_causal_conv1d         4.57%      82.923us        99.70%       1.809ms       1.809ms       0.000us         0.00%       6.432us       6.432us             1  
-                                         CausalConv1dFn         4.25%      77.065us        95.13%       1.727ms     575.517us       0.000us         0.00%       6.432us       2.144us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      25.889us        89.13%       1.618ms     539.172us       4.800us       100.00%       6.432us       2.144us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
-                                Activity Buffer Request        78.67%       1.428ms        78.67%       1.428ms       1.428ms       1.632us        34.00%       1.632us       1.632us             1  
-                                       aten::empty_like         0.53%       9.690us         1.76%      31.970us      10.657us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.23%      22.280us         1.23%      22.280us       7.427us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.03%     163.837us         9.03%     163.837us      54.612us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       5.391us         0.30%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.080us      2644.30%     126.080us     126.080us             1  
+                               hf_kernels_causal_conv1d         5.18%     102.863us        99.75%       1.979ms       1.979ms       0.000us         0.00%       6.368us       6.368us             1  
+                                         CausalConv1dFn         3.95%      78.303us        94.57%       1.876ms     625.402us       0.000us         0.00%       6.368us       2.123us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      24.140us        89.14%       1.768ms     589.491us       4.768us       100.00%       6.368us       2.123us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
+                                Activity Buffer Request        79.49%       1.577ms        79.49%       1.577ms       1.577ms       1.600us        33.56%       1.600us       1.600us             1  
+                                       aten::empty_like         0.40%       7.900us         1.48%      29.430us       9.810us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.09%      21.530us         1.09%      21.530us       7.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.43%     167.184us         8.43%     167.184us      55.728us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       4.910us         0.25%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.815ms
-Self CUDA time total: 4.800us
+Self CPU time total: 1.984ms
+Self CUDA time total: 4.768us
 
 
 
@@ -4083,19 +4083,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.655us      2439.95%     118.655us     118.655us             1  
-                               hf_kernels_causal_conv1d        15.62%      77.102us        98.87%     488.177us     488.177us       0.000us         0.00%       6.495us       6.495us             1  
-                                         CausalConv1dFn        14.62%      72.193us        83.25%     411.075us     137.025us       0.000us         0.00%       6.495us       2.165us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.27%      26.040us        62.53%     308.751us     102.917us       4.863us       100.00%       6.495us       2.165us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.863us       100.00%       4.863us       1.621us             3  
-                                Activity Buffer Request        25.28%     124.815us        25.28%     124.815us     124.815us       1.632us        33.56%       1.632us       1.632us             1  
-                                       aten::empty_like         1.61%       7.949us         6.10%      30.131us      10.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.49%      22.182us         4.49%      22.182us       7.394us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.98%     157.896us        31.98%     157.896us      52.632us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.13%       5.580us         1.13%       5.580us       5.580us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.055us      2488.80%     121.055us     121.055us             1  
+                               hf_kernels_causal_conv1d        13.09%      78.123us        99.20%     592.205us     592.205us       0.000us         0.00%       6.528us       6.528us             1  
+                                         CausalConv1dFn        13.01%      77.643us        86.11%     514.082us     171.361us       0.000us         0.00%       6.528us       2.176us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.18%      24.929us        68.36%     408.089us     136.030us       4.864us       100.00%       6.528us       2.176us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.864us       100.00%       4.864us       1.621us             3  
+                                Activity Buffer Request        36.63%     218.665us        36.63%     218.665us     218.665us       1.664us        34.21%       1.664us       1.664us             1  
+                                       aten::empty_like         1.31%       7.839us         4.75%      28.350us       9.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.44%      20.511us         3.44%      20.511us       6.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.55%     164.495us        27.55%     164.495us      54.832us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.80%       4.790us         0.80%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 493.757us
-Self CUDA time total: 4.863us
+Self CPU time total: 596.995us
+Self CUDA time total: 4.864us
 
 
 
@@ -4105,19 +4105,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.463us      1179.69%     126.463us     126.463us             1  
-                               hf_kernels_causal_conv1d         4.44%      79.793us        99.69%       1.793ms       1.793ms       0.000us         0.00%      14.304us      14.304us             1  
-                                         CausalConv1dFn         3.96%      71.252us        95.25%       1.713ms     571.037us       0.000us         0.00%      14.304us       4.768us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.37%      24.661us        89.51%       1.610ms     536.652us      10.720us       100.00%      14.304us       4.768us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.720us       100.00%      10.720us       3.573us             3  
-                                Activity Buffer Request        79.30%       1.426ms        79.30%       1.426ms       1.426ms       3.584us        33.43%       3.584us       3.584us             1  
-                                       aten::empty_like         0.54%       9.750us         1.77%      31.901us      10.634us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.23%      22.151us         1.23%      22.151us       7.384us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.84%     159.036us         8.84%     159.036us      53.012us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.660us         0.31%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.031us      1201.49%     128.031us     128.031us             1  
+                               hf_kernels_causal_conv1d         5.58%     105.873us        99.72%       1.893ms       1.893ms       0.000us         0.00%      14.208us      14.208us             1  
+                                         CausalConv1dFn         4.13%      78.341us        94.14%       1.787ms     595.748us       0.000us         0.00%      14.208us       4.736us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.45%      27.570us        88.49%       1.680ms     559.957us      10.656us       100.00%      14.208us       4.736us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us       100.00%      10.656us       3.552us             3  
+                                Activity Buffer Request        77.94%       1.480ms        77.94%       1.480ms       1.480ms       3.552us        33.33%       3.552us       3.552us             1  
+                                       aten::empty_like         0.41%       7.812us         1.53%      29.032us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.12%      21.220us         1.12%      21.220us       7.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.09%     172.624us         9.09%     172.624us      57.541us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.330us         0.28%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.799ms
-Self CUDA time total: 10.720us
+Self CPU time total: 1.898ms
+Self CUDA time total: 10.656us
 
 
 
@@ -4127,19 +4127,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.490us      1115.98%     122.490us     122.490us             1  
-                               hf_kernels_causal_conv1d        17.58%      82.141us        98.94%     462.145us     462.145us       0.000us         0.00%      14.656us      14.656us             1  
-                                         CausalConv1dFn        15.46%      72.195us        81.35%     380.004us     126.668us       0.000us         0.00%      14.656us       4.885us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.51%      25.720us        59.56%     278.229us      92.743us      10.976us       100.00%      14.656us       4.885us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
-                                Activity Buffer Request        20.67%      96.553us        20.67%      96.553us      96.553us       3.680us        33.53%       3.680us       3.680us             1  
-                                       aten::empty_like         1.79%       8.340us         6.33%      29.580us       9.860us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.55%      21.240us         4.55%      21.240us       7.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.39%     155.956us        33.39%     155.956us      51.985us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.06%       4.970us         1.06%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.524us      1119.66%     122.524us     122.524us             1  
+                               hf_kernels_causal_conv1d        19.00%     100.263us        99.02%     522.563us     522.563us       0.000us         0.00%      14.623us      14.623us             1  
+                                         CausalConv1dFn        14.56%      76.813us        80.02%     422.300us     140.767us       0.000us         0.00%      14.623us       4.874us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.04%      26.621us        60.06%     316.927us     105.642us      10.943us       100.00%      14.623us       4.874us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.943us       100.00%      10.943us       3.648us             3  
+                                Activity Buffer Request        24.63%     129.993us        24.63%     129.993us     129.993us       3.680us        33.63%       3.680us       3.680us             1  
+                                       aten::empty_like         1.53%       8.070us         5.41%      28.560us       9.520us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.88%      20.490us         3.88%      20.490us       6.830us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.38%     160.313us        30.38%     160.313us      53.438us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       5.160us         0.98%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 467.115us
-Self CUDA time total: 10.976us
+Self CPU time total: 527.723us
+Self CUDA time total: 10.943us
 
 
 
@@ -4149,18 +4149,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.671us      1165.50%     128.671us     128.671us             1  
-                               hf_kernels_causal_conv1d         4.51%      81.351us        99.72%       1.798ms       1.798ms       0.000us         0.00%      14.784us      14.784us             1  
-                                         CausalConv1dFn         4.05%      73.093us        95.21%       1.717ms     572.174us       0.000us         0.00%      14.784us       4.928us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.34%      24.081us        89.39%       1.612ms     537.183us      11.040us       100.00%      14.784us       4.928us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.879us      1185.50%     130.879us     130.879us             1  
+                               hf_kernels_causal_conv1d         6.10%     112.423us        99.71%       1.839ms       1.839ms       0.000us         0.00%      14.752us      14.752us             1  
+                                         CausalConv1dFn         4.42%      81.553us        93.62%       1.726ms     575.457us       0.000us         0.00%      14.752us       4.917us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.34%      24.629us        87.45%       1.613ms     537.533us      11.040us       100.00%      14.752us       4.917us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.040us       100.00%      11.040us       3.680us             3  
-                                Activity Buffer Request        79.34%       1.430ms        79.34%       1.430ms       1.430ms       3.744us        33.91%       3.744us       3.744us             1  
-                                       aten::empty_like         0.49%       8.921us         1.77%      31.881us      10.627us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.27%      22.960us         1.27%      22.960us       7.653us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.72%     157.177us         8.72%     157.177us      52.392us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       4.970us         0.28%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        77.44%       1.428ms        77.44%       1.428ms       1.428ms       3.712us        33.62%       3.712us       3.712us             1  
+                                       aten::empty_like         0.46%       8.560us         1.75%      32.220us      10.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.28%      23.660us         1.28%      23.660us       7.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.67%     159.915us         8.67%     159.915us      53.305us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.29%       5.260us         0.29%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.803ms
+Self CPU time total: 1.844ms
 Self CUDA time total: 11.040us
 
 
@@ -4171,19 +4171,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.762us      1085.65%     125.762us     125.762us             1  
-                               hf_kernels_causal_conv1d        16.83%      79.002us        98.82%     463.887us     463.887us       0.000us         0.00%      15.360us      15.360us             1  
-                                         CausalConv1dFn        15.62%      73.323us        81.99%     384.885us     128.295us       0.000us         0.00%      15.360us       5.120us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.37%      25.230us        59.95%     281.430us      93.810us      11.584us       100.00%      15.360us       5.120us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.584us       100.00%      11.584us       3.861us             3  
-                                Activity Buffer Request        20.79%      97.593us        20.79%      97.593us      97.593us       3.776us        32.60%       3.776us       3.776us             1  
-                                       aten::empty_like         1.82%       8.531us         6.42%      30.132us      10.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.60%      21.601us         4.60%      21.601us       7.200us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.79%     158.607us        33.79%     158.607us      52.869us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.18%       5.530us         1.18%       5.530us       5.530us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.988us      1097.16%     124.988us     124.988us             1  
+                               hf_kernels_causal_conv1d        14.68%      75.042us        98.95%     505.802us     505.802us       0.000us         0.00%      15.232us      15.232us             1  
+                                         CausalConv1dFn        15.20%      77.712us        84.27%     430.760us     143.587us       0.000us         0.00%      15.232us       5.077us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.71%      24.091us        63.54%     324.777us     108.259us      11.392us       100.00%      15.232us       5.077us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.392us       100.00%      11.392us       3.797us             3  
+                                Activity Buffer Request        26.66%     136.263us        26.66%     136.263us     136.263us       3.840us        33.71%       3.840us       3.840us             1  
+                                       aten::empty_like         1.46%       7.441us         5.53%      28.271us       9.424us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.08%      20.830us         4.08%      20.830us       6.943us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.17%     164.423us        32.17%     164.423us      54.808us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.05%       5.351us         1.05%       5.351us       5.351us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 469.417us
-Self CUDA time total: 11.584us
+Self CPU time total: 511.153us
+Self CUDA time total: 11.392us
 
 
 
@@ -4193,19 +4193,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     134.046us       264.80%     134.046us     134.046us             1  
-                               hf_kernels_causal_conv1d         4.19%      76.942us        99.71%       1.832ms       1.832ms       0.000us         0.00%      84.285us      84.285us             1  
-                                         CausalConv1dFn         4.10%      75.381us        95.52%       1.755ms     585.044us       0.000us         0.00%      84.285us      28.095us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.30%      23.952us        89.70%       1.648ms     549.413us      50.622us       100.00%      84.285us      28.095us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.622us       100.00%      50.622us      16.874us             3  
-                                Activity Buffer Request        78.71%       1.446ms        78.71%       1.446ms       1.446ms      33.663us        66.50%      33.663us      33.663us             1  
-                                       aten::empty_like         0.54%       9.991us         1.71%      31.512us      10.504us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.17%      21.521us         1.17%      21.521us       7.174us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.69%     177.966us         9.69%     177.966us      59.322us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.380us         0.29%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.775us       262.12%     131.775us     131.775us             1  
+                               hf_kernels_causal_conv1d         8.81%      77.263us        99.39%     871.362us     871.362us       0.000us         0.00%      83.680us      83.680us             1  
+                                         CausalConv1dFn         8.68%      76.121us        90.57%     794.099us     264.700us       0.000us         0.00%      83.680us      27.893us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.02%      26.501us        78.58%     688.947us     229.649us      50.272us       100.00%      83.680us      27.893us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.272us       100.00%      50.272us      16.757us             3  
+                                Activity Buffer Request        55.77%     488.972us        55.77%     488.972us     488.972us      33.408us        66.45%      33.408us      33.408us             1  
+                                       aten::empty_like         0.92%       8.040us         3.31%      29.031us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.39%      20.991us         2.39%      20.991us       6.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.79%     173.474us        19.79%     173.474us      57.825us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.61%       5.370us         0.61%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.837ms
-Self CUDA time total: 50.622us
+Self CPU time total: 876.732us
+Self CUDA time total: 50.272us
 
 
 
@@ -4215,19 +4215,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.639us       241.17%     124.639us     124.639us             1  
-                               hf_kernels_causal_conv1d        12.15%      73.652us        99.08%     600.632us     600.632us       0.000us         0.00%      86.272us      86.272us             1  
-                                         CausalConv1dFn        11.76%      71.283us        86.93%     526.980us     175.660us       0.000us         0.00%      86.272us      28.757us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.05%      24.580us        70.27%     425.965us     141.988us      51.680us       100.00%      86.272us      28.757us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.680us       100.00%      51.680us      17.227us             3  
-                                Activity Buffer Request        38.62%     234.139us        38.62%     234.139us     234.139us      34.592us        66.93%      34.592us      34.592us             1  
-                                       aten::empty_like         1.31%       7.952us         4.90%      29.732us       9.911us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.59%      21.780us         3.59%      21.780us       7.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.59%     167.246us        27.59%     167.246us      55.749us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.92%       5.560us         0.92%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.295us       247.23%     127.295us     127.295us             1  
+                               hf_kernels_causal_conv1d        15.09%      77.332us        99.04%     507.562us     507.562us       0.000us         0.00%      86.016us      86.016us             1  
+                                         CausalConv1dFn        14.68%      75.241us        83.95%     430.230us     143.410us       0.000us         0.00%      86.016us      28.672us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.05%      25.861us        63.40%     324.927us     108.309us      51.488us       100.00%      86.016us      28.672us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.488us       100.00%      51.488us      17.163us             3  
+                                Activity Buffer Request        25.26%     129.463us        25.26%     129.463us     129.463us      34.528us        67.06%      34.528us      34.528us             1  
+                                       aten::empty_like         1.67%       8.561us         5.87%      30.062us      10.021us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.20%      21.501us         4.20%      21.501us       7.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.09%     169.603us        33.09%     169.603us      56.534us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.96%       4.929us         0.96%       4.929us       4.929us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 606.192us
-Self CUDA time total: 51.680us
+Self CPU time total: 512.491us
+Self CUDA time total: 51.488us
 
 
 
@@ -4237,18 +4237,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.184us      3001.64%     117.184us     117.184us             1  
-                               hf_kernels_causal_conv1d        11.99%      71.634us        99.07%     591.661us     591.661us       0.000us         0.00%       5.152us       5.152us             1  
-                                         CausalConv1dFn        11.65%      69.552us        87.08%     520.027us     173.342us       0.000us         0.00%       5.152us       1.717us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.09%      24.400us        70.30%     419.834us     139.945us       3.904us       100.00%       5.152us       1.717us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.214us      3104.87%     121.214us     121.214us             1  
+                               hf_kernels_causal_conv1d         8.71%      75.123us        99.37%     856.672us     856.672us       0.000us         0.00%       5.184us       5.184us             1  
+                                         CausalConv1dFn         8.55%      73.741us        90.66%     781.549us     260.516us       0.000us         0.00%       5.184us       1.728us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.92%      25.150us        78.63%     677.857us     225.952us       3.904us       100.00%       5.184us       1.728us             3  
 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.904us       100.00%       3.904us       1.301us             3  
-                                Activity Buffer Request        39.52%     236.029us        39.52%     236.029us     236.029us       1.248us        31.97%       1.248us       1.248us             1  
-                                       aten::empty_like         1.39%       8.281us         5.13%      30.641us      10.214us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.74%      22.360us         3.74%      22.360us       7.453us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        26.69%     159.405us        26.69%     159.405us      53.135us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.93%       5.550us         0.93%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        56.24%     484.832us        56.24%     484.832us     484.832us       1.280us        32.79%       1.280us       1.280us             1  
+                                       aten::empty_like         1.08%       9.311us         3.47%      29.951us       9.984us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.39%      20.640us         2.39%      20.640us       6.880us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.47%     167.875us        19.47%     167.875us      55.958us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.63%       5.440us         0.63%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 597.211us
+Self CPU time total: 862.112us
 Self CUDA time total: 3.904us
 
 
@@ -4259,19 +4259,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.214us      3308.94%     129.214us     129.214us             1  
-                               hf_kernels_causal_conv1d        14.44%      74.841us        98.93%     512.678us     512.678us       0.000us         0.00%       5.154us       5.154us             1  
-                                         CausalConv1dFn        14.14%      73.283us        84.49%     437.837us     145.946us       0.000us         0.00%       5.154us       1.718us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         6.57%      34.031us        64.55%     334.472us     111.491us       3.905us       100.00%       5.154us       1.718us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.905us       100.00%       3.905us       1.302us             3  
-                                Activity Buffer Request        27.83%     144.225us        27.83%     144.225us     144.225us       1.249us        31.98%       1.249us       1.249us             1  
-                                       aten::empty_like         1.69%       8.750us         5.81%      30.082us      10.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.12%      21.332us         4.12%      21.332us       7.111us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.15%     156.216us        30.15%     156.216us      52.072us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.07%       5.520us         1.07%       5.520us       5.520us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.438us      3086.10%     121.438us     121.438us             1  
+                               hf_kernels_causal_conv1d        15.37%      74.422us        98.89%     478.921us     478.921us       0.000us         0.00%       5.183us       5.183us             1  
+                                         CausalConv1dFn        15.69%      75.972us        83.52%     404.499us     134.833us       0.000us         0.00%       5.183us       1.728us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.44%      26.330us        61.72%     298.936us      99.645us       3.935us       100.00%       5.183us       1.728us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.935us       100.00%       3.935us       1.312us             3  
+                                Activity Buffer Request        23.74%     114.963us        23.74%     114.963us     114.963us       1.248us        31.72%       1.248us       1.248us             1  
+                                       aten::empty_like         1.57%       7.609us         6.11%      29.591us       9.864us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.54%      21.982us         4.54%      21.982us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.55%     157.643us        32.55%     157.643us      52.548us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.11%       5.391us         1.11%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 518.198us
-Self CUDA time total: 3.905us
+Self CPU time total: 484.312us
+Self CUDA time total: 3.935us
 
 
 
@@ -4281,19 +4281,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.525us      2939.61%     118.525us     118.525us             1  
-                               hf_kernels_causal_conv1d        13.97%      75.404us        99.13%     534.960us     534.960us       0.000us         0.00%       5.376us       5.376us             1  
-                                         CausalConv1dFn        13.10%      70.683us        85.16%     459.556us     153.185us       0.000us         0.00%       5.376us       1.792us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.73%      25.549us        66.42%     358.442us     119.481us       4.032us       100.00%       5.376us       1.792us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
-                                Activity Buffer Request        32.81%     177.046us        32.81%     177.046us     177.046us       1.344us        33.33%       1.344us       1.344us             1  
-                                       aten::empty_like         1.62%       8.721us         5.64%      30.431us      10.144us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.02%      21.710us         4.02%      21.710us       7.237us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        28.88%     155.847us        28.88%     155.847us      51.949us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.87%       4.710us         0.87%       4.710us       4.710us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     152.157us      3744.94%     152.157us     152.157us             1  
+                               hf_kernels_causal_conv1d        10.88%      77.931us        99.21%     710.327us     710.327us       0.000us         0.00%       5.407us       5.407us             1  
+                                         CausalConv1dFn        11.39%      81.522us        88.32%     632.396us     210.799us       0.000us         0.00%       5.407us       1.802us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.86%      27.639us        72.73%     520.742us     173.581us       4.063us       100.00%       5.407us       1.802us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
+                                Activity Buffer Request        44.05%     315.408us        44.05%     315.408us     315.408us       1.344us        33.08%       1.344us       1.344us             1  
+                                       aten::empty_like         1.15%       8.200us         4.21%      30.132us      10.044us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.06%      21.932us         3.06%      21.932us       7.311us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.82%     177.695us        24.82%     177.695us      59.232us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.79%       5.681us         0.79%       5.681us       5.681us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 539.670us
-Self CUDA time total: 4.032us
+Self CPU time total: 716.008us
+Self CUDA time total: 4.063us
 
 
 
@@ -4303,19 +4303,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.905us      2852.70%     115.905us     115.905us             1  
-                               hf_kernels_causal_conv1d        16.16%      74.143us        98.83%     453.315us     453.315us       0.000us         0.00%       5.407us       5.407us             1  
-                                         CausalConv1dFn        14.93%      68.471us        82.67%     379.172us     126.391us       0.000us         0.00%       5.407us       1.802us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.63%      25.811us        61.32%     281.280us      93.760us       4.063us       100.00%       5.407us       1.802us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
-                                Activity Buffer Request        21.83%     100.113us        21.83%     100.113us     100.113us       1.344us        33.08%       1.344us       1.344us             1  
-                                       aten::empty_like         1.88%       8.641us         6.41%      29.421us       9.807us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.53%      20.780us         4.53%      20.780us       6.927us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.87%     155.356us        33.87%     155.356us      51.785us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.17%       5.370us         1.17%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.936us      2951.18%     119.936us     119.936us             1  
+                               hf_kernels_causal_conv1d        15.86%      75.552us        99.00%     471.672us     471.672us       0.000us         0.00%       5.440us       5.440us             1  
+                                         CausalConv1dFn        16.03%      76.383us        83.14%     396.120us     132.040us       0.000us         0.00%       5.440us       1.813us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.35%      25.480us        61.26%     291.866us      97.289us       4.064us       100.00%       5.440us       1.813us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
+                                Activity Buffer Request        23.14%     110.243us        23.14%     110.243us     110.243us       1.376us        33.86%       1.376us       1.376us             1  
+                                       aten::empty_like         1.53%       7.269us         5.85%      27.871us       9.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.32%      20.602us         4.32%      20.602us       6.867us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.77%     156.143us        32.77%     156.143us      52.048us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.00%       4.760us         1.00%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 458.685us
-Self CUDA time total: 4.063us
+Self CPU time total: 476.432us
+Self CUDA time total: 4.064us
 
 
 
@@ -4325,19 +4325,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.141us      2271.97%     122.141us     122.141us             1  
-                               hf_kernels_causal_conv1d        11.82%      75.911us        99.15%     636.712us     636.712us       0.000us         0.00%       7.200us       7.200us             1  
-                                         CausalConv1dFn        11.01%      70.722us        87.33%     560.801us     186.934us       0.000us         0.00%       7.200us       2.400us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.24%      27.210us        71.66%     460.136us     153.379us       5.376us       100.00%       7.200us       2.400us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
-                                Activity Buffer Request        43.06%     276.540us        43.06%     276.540us     276.540us       1.824us        33.93%       1.824us       1.824us             1  
-                                       aten::empty_like         1.25%       8.002us         4.66%      29.943us       9.981us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.42%      21.941us         3.42%      21.941us       7.314us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.35%     156.386us        24.35%     156.386us      52.129us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.85%       5.440us         0.85%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.888us      2401.78%     129.888us     129.888us             1  
+                               hf_kernels_causal_conv1d        13.50%     106.873us        99.32%     785.980us     785.980us       0.000us         0.00%       7.264us       7.264us             1  
+                                         CausalConv1dFn        10.04%      79.422us        85.81%     679.107us     226.369us       0.000us         0.00%       7.264us       2.421us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.32%      26.310us        72.10%     570.564us     190.188us       5.408us       100.00%       7.264us       2.421us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.408us       100.00%       5.408us       1.803us             3  
+                                Activity Buffer Request        48.81%     386.260us        48.81%     386.260us     386.260us       1.856us        34.32%       1.856us       1.856us             1  
+                                       aten::empty_like         1.01%       7.981us         3.68%      29.121us       9.707us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.67%      21.140us         2.67%      21.140us       7.047us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.96%     157.994us        19.96%     157.994us      52.665us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.68%       5.410us         0.68%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 642.152us
-Self CUDA time total: 5.376us
+Self CPU time total: 791.390us
+Self CUDA time total: 5.408us
 
 
 
@@ -4347,19 +4347,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.822us      2140.66%     117.822us     117.822us             1  
-                               hf_kernels_causal_conv1d        16.30%      72.964us        98.80%     442.326us     442.326us       0.000us         0.00%       7.392us       7.392us             1  
-                                         CausalConv1dFn        16.19%      72.472us        82.50%     369.362us     123.121us       0.000us         0.00%       7.392us       2.464us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.63%      25.211us        59.71%     267.319us      89.106us       5.504us       100.00%       7.392us       2.464us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.504us       100.00%       5.504us       1.835us             3  
-                                Activity Buffer Request        19.35%      86.632us        19.35%      86.632us      86.632us       1.888us        34.30%       1.888us       1.888us             1  
-                                       aten::empty_like         1.85%       8.281us         6.60%      29.571us       9.857us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.76%      21.290us         4.76%      21.290us       7.097us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.73%     155.476us        34.73%     155.476us      51.825us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.20%       5.391us         1.20%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.463us      2151.92%     118.463us     118.463us             1  
+                               hf_kernels_causal_conv1d        19.47%      96.181us        98.96%     488.812us     488.812us       0.000us         0.00%       7.393us       7.393us             1  
+                                         CausalConv1dFn        15.19%      75.044us        79.49%     392.631us     130.877us       0.000us         0.00%       7.393us       2.464us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.31%      26.241us        58.39%     288.397us      96.132us       5.505us       100.00%       7.393us       2.464us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.505us       100.00%       5.505us       1.835us             3  
+                                Activity Buffer Request        21.50%     106.222us        21.50%     106.222us     106.222us       1.888us        34.30%       1.888us       1.888us             1  
+                                       aten::empty_like         1.50%       7.390us         5.91%      29.190us       9.730us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.41%      21.800us         4.41%      21.800us       7.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.57%     155.934us        31.57%     155.934us      51.978us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.04%       5.140us         1.04%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 447.717us
-Self CUDA time total: 5.504us
+Self CPU time total: 493.952us
+Self CUDA time total: 5.505us
 
 
 
@@ -4369,19 +4369,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.728us       716.97%     125.728us     125.728us             1  
-                               hf_kernels_causal_conv1d        11.80%      75.821us        99.14%     637.002us     637.002us       0.000us         0.00%      23.392us      23.392us             1  
-                                         CausalConv1dFn        11.24%      72.243us        87.34%     561.181us     187.060us       0.000us         0.00%      23.392us       7.797us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.08%      26.210us        71.24%     457.746us     152.582us      17.536us       100.00%      23.392us       7.797us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.536us       100.00%      17.536us       5.845us             3  
-                                Activity Buffer Request        42.92%     275.770us        42.92%     275.770us     275.770us       5.856us        33.39%       5.856us       5.856us             1  
-                                       aten::empty_like         1.45%       9.311us         4.85%      31.192us      10.397us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.41%      21.881us         3.41%      21.881us       7.294us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.24%     155.766us        24.24%     155.766us      51.922us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.86%       5.550us         0.86%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.279us       741.28%     129.279us     129.279us             1  
+                               hf_kernels_causal_conv1d         5.08%      91.861us        99.73%       1.805ms       1.805ms       0.000us         0.00%      23.296us      23.296us             1  
+                                         CausalConv1dFn         4.24%      76.815us        94.65%       1.713ms     571.078us       0.000us         0.00%      23.296us       7.765us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.42%      25.791us        88.76%       1.607ms     535.516us      17.440us       100.00%      23.296us       7.765us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.440us       100.00%      17.440us       5.813us             3  
+                                Activity Buffer Request        78.65%       1.424ms        78.65%       1.424ms       1.424ms       5.856us        33.58%       5.856us       5.856us             1  
+                                       aten::empty_like         0.47%       8.500us         1.65%      29.870us       9.957us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      21.370us         1.18%      21.370us       7.123us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.68%     157.163us         8.68%     157.163us      52.388us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.911us         0.27%       4.911us       4.911us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 642.552us
-Self CUDA time total: 17.536us
+Self CPU time total: 1.810ms
+Self CUDA time total: 17.440us
 
 
 
@@ -4391,19 +4391,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.901us       690.22%     123.901us     123.901us             1  
-                               hf_kernels_causal_conv1d        16.99%      75.711us        98.78%     440.245us     440.245us       0.000us         0.00%      23.967us      23.967us             1  
-                                         CausalConv1dFn        15.81%      70.471us        81.79%     364.534us     121.511us       0.000us         0.00%      23.967us       7.989us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.65%      25.192us        59.40%     264.751us      88.250us      17.951us       100.00%      23.967us       7.989us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.951us       100.00%      17.951us       5.984us             3  
-                                Activity Buffer Request        18.53%      82.593us        18.53%      82.593us      82.593us       6.016us        33.51%       6.016us       6.016us             1  
-                                       aten::empty_like         1.75%       7.802us         6.58%      29.312us       9.771us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.83%      21.510us         4.83%      21.510us       7.170us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.22%     156.966us        35.22%     156.966us      52.322us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.22%       5.440us         1.22%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     139.324us       772.01%     139.324us     139.324us             1  
+                               hf_kernels_causal_conv1d        18.68%      93.362us        99.02%     494.883us     494.883us       0.000us         0.00%      24.095us      24.095us             1  
+                                         CausalConv1dFn        17.38%      86.843us        80.34%     401.521us     133.840us       0.000us         0.00%      24.095us       8.032us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.36%      26.789us        57.15%     285.628us      95.209us      18.047us       100.00%      24.095us       8.032us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
+                                Activity Buffer Request        20.49%     102.403us        20.49%     102.403us     102.403us       6.048us        33.51%       6.048us       6.048us             1  
+                                       aten::empty_like         1.48%       7.399us         5.81%      29.050us       9.683us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.33%      21.651us         4.33%      21.651us       7.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.30%     156.436us        31.30%     156.436us      52.145us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       4.890us         0.98%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 445.685us
-Self CUDA time total: 17.951us
+Self CPU time total: 499.773us
+Self CUDA time total: 18.047us
 
 
 
@@ -4413,19 +4413,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.804us       730.34%     131.804us     131.804us             1  
-                               hf_kernels_causal_conv1d        11.57%      77.592us        99.18%     665.133us     665.133us       0.000us         0.00%      24.094us      24.094us             1  
-                                         CausalConv1dFn        10.93%      73.321us        87.61%     587.541us     195.847us       0.000us         0.00%      24.094us       8.031us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.40%      22.811us        71.94%     482.478us     160.826us      18.047us       100.00%      24.094us       8.031us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
-                                Activity Buffer Request        44.54%     298.731us        44.54%     298.731us     298.731us       6.047us        33.51%       6.047us       6.047us             1  
-                                       aten::empty_like         1.35%       9.049us         4.73%      31.742us      10.581us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.38%      22.693us         3.38%      22.693us       7.564us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.00%     160.936us        24.00%     160.936us      53.645us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.82%       5.510us         0.82%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     135.103us       748.58%     135.103us     135.103us             1  
+                               hf_kernels_causal_conv1d         5.37%      98.434us        99.69%       1.829ms       1.829ms       0.000us         0.00%      24.097us      24.097us             1  
+                                         CausalConv1dFn         4.35%      79.821us        94.33%       1.730ms     576.697us       0.000us         0.00%      24.097us       8.032us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      24.912us        88.33%       1.620ms     540.010us      18.048us       100.00%      24.097us       8.032us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.048us       100.00%      18.048us       6.016us             3  
+                                Activity Buffer Request        77.78%       1.427ms        77.78%       1.427ms       1.427ms       6.049us        33.52%       6.049us       6.049us             1  
+                                       aten::empty_like         0.47%       8.550us         1.65%      30.240us      10.080us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      21.690us         1.18%      21.690us       7.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.19%     168.514us         9.19%     168.514us      56.171us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.620us         0.31%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 670.643us
-Self CUDA time total: 18.047us
+Self CPU time total: 1.834ms
+Self CUDA time total: 18.048us
 
 
 
@@ -4435,19 +4435,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.267us       637.87%     122.267us     122.267us             1  
-                               hf_kernels_causal_conv1d        16.94%      75.003us        98.82%     437.665us     437.665us       0.000us         0.00%      25.632us      25.632us             1  
-                                         CausalConv1dFn        15.90%      70.409us        81.89%     362.662us     120.887us       0.000us         0.00%      25.632us       8.544us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.97%      26.462us        59.15%     261.981us      87.327us      19.168us       100.00%      25.632us       8.544us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us       100.00%      19.168us       6.389us             3  
-                                Activity Buffer Request        18.04%      79.883us        18.04%      79.883us      79.883us       6.464us        33.72%       6.464us       6.464us             1  
-                                       aten::empty_like         2.06%       9.102us         6.84%      30.272us      10.091us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.78%      21.170us         4.78%      21.170us       7.057us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.14%     155.636us        35.14%     155.636us      51.879us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.18%       5.220us         1.18%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.684us       694.54%     130.684us     130.684us             1  
+                               hf_kernels_causal_conv1d        18.98%      97.223us        99.02%     507.183us     507.183us       0.000us         0.00%      25.120us      25.120us             1  
+                                         CausalConv1dFn        14.58%      74.692us        80.04%     409.960us     136.653us       0.000us         0.00%      25.120us       8.373us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         6.51%      33.321us        59.71%     305.838us     101.946us      18.816us       100.00%      25.120us       8.373us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.816us       100.00%      18.816us       6.272us             3  
+                                Activity Buffer Request        22.33%     114.353us        22.33%     114.353us     114.353us       6.304us        33.50%       6.304us       6.304us             1  
+                                       aten::empty_like         1.71%       8.769us         5.75%      29.430us       9.810us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.03%      20.661us         4.03%      20.661us       6.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.88%     158.164us        30.88%     158.164us      52.721us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       5.010us         0.98%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 442.885us
-Self CUDA time total: 19.168us
+Self CPU time total: 512.193us
+Self CUDA time total: 18.816us
 
 
 
@@ -4457,19 +4457,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         4.25%      77.621us        99.69%       1.822ms       1.822ms       0.000us         0.00%     163.007us     163.007us             1  
-                                         CausalConv1dFn         4.18%      76.374us        95.44%       1.744ms     581.328us       0.000us         0.00%     163.007us      54.336us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.34%      24.550us        89.50%       1.636ms     545.169us      97.983us       100.00%     163.007us      54.336us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     142.719us       145.66%     142.719us     142.719us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.983us       100.00%      97.983us      32.661us             3  
-                                Activity Buffer Request        79.33%       1.450ms        79.33%       1.450ms       1.450ms      65.024us        66.36%      65.024us      65.024us             1  
-                                       aten::empty_like         0.51%       9.271us         1.76%      32.102us      10.701us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.25%      22.831us         1.25%      22.831us       7.610us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.83%     161.275us         8.83%     161.275us      53.758us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.740us         0.31%       5.740us       5.740us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         6.14%     112.394us        99.70%       1.825ms       1.825ms       0.000us         0.00%     162.754us     162.754us             1  
+                                         CausalConv1dFn         4.41%      80.651us        93.56%       1.713ms     570.927us       0.000us         0.00%     162.754us      54.251us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.37%      25.010us        87.54%       1.603ms     534.193us      97.985us       100.00%     162.754us      54.251us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.737us       147.71%     144.737us     144.737us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.985us       100.00%      97.985us      32.662us             3  
+                                Activity Buffer Request        77.36%       1.416ms        77.36%       1.416ms       1.416ms      64.769us        66.10%      64.769us      64.769us             1  
+                                       aten::empty_like         0.49%       8.901us         1.61%      29.551us       9.850us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.13%      20.650us         1.13%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.82%     161.445us         8.82%     161.445us      53.815us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.480us         0.30%       5.480us       5.480us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.827ms
-Self CUDA time total: 97.983us
+Self CPU time total: 1.831ms
+Self CUDA time total: 97.985us
 
 
 
@@ -4479,19 +4479,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        17.00%      78.131us        98.89%     454.476us     454.476us       0.000us         0.00%     164.440us     164.440us             1  
-                                         CausalConv1dFn        15.89%      73.024us        81.89%     376.345us     125.448us       0.000us         0.00%     164.440us      54.813us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.76%      26.451us        59.63%     274.060us      91.353us      98.939us       100.00%     164.440us      54.813us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     139.130us       140.62%     139.130us     139.130us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.939us       100.00%      98.939us      32.980us             3  
-                                Activity Buffer Request        18.20%      83.643us        18.20%      83.643us      83.643us      65.501us        66.20%      65.501us      65.501us             1  
-                                       aten::empty_like         1.75%       8.030us         6.37%      29.261us       9.754us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.62%      21.231us         4.62%      21.231us       7.077us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.68%     163.966us        35.68%     163.966us      54.655us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.11%       5.111us         1.11%       5.111us       5.111us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        19.17%      96.654us        98.90%     498.573us     498.573us       0.000us         0.00%     163.900us     163.900us             1  
+                                         CausalConv1dFn        15.33%      77.291us        79.73%     401.919us     133.973us       0.000us         0.00%     163.900us      54.633us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.17%      26.053us        58.73%     296.088us      98.696us      98.813us       100.00%     163.900us      54.633us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.981us       135.59%     133.981us     133.981us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.813us       100.00%      98.813us      32.938us             3  
+                                Activity Buffer Request        22.39%     112.882us        22.39%     112.882us     112.882us      65.087us        65.87%      65.087us      65.087us             1  
+                                       aten::empty_like         1.55%       7.820us         5.66%      28.540us       9.513us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.11%      20.720us         4.11%      20.720us       6.907us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.17%     157.153us        31.17%     157.153us      52.384us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.10%       5.550us         1.10%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 459.587us
-Self CUDA time total: 98.939us
+Self CPU time total: 504.123us
+Self CUDA time total: 98.813us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4502,11 +4502,11 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.06  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.06  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2     0.05  True
@@ -4517,20 +4517,18 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.06  True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 
▶ UV Install Logs
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.26it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.12it/s]
+Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:01, 3.95it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 6.21it/s]

Artifacts:

causal_conv1d.jsonl diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html index a14fe1d8732e839025c8dec1c927653b8a3a02ff..2dd29f110a68d2d6a2cb36ff92b20f1c54eab64b 100644 --- a/causal_conv1d/impls/torch_causal_conv1d.html +++ b/causal_conv1d/impls/torch_causal_conv1d.html @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.24s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
-
Tue Oct 28 14:08:09 2025       
+
Wed Oct 29 14:27:09 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   28C    P0             80W /  350W |       0MiB /  46068MiB |     19%      Default |
+| N/A   33C    P0            109W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3918,9 +3918,9 @@ Cell: nv | 0.21s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 3.63s
+Cell: benchmark | 7.23s
  | 
 
 Raw
@@ -3982,29 +3982,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     448.254us      2311.66%     448.254us     448.254us             1  
-                                            torch_eager        10.53%     223.197us        99.60%       2.112ms       2.112ms       0.000us         0.00%      21.727us      21.727us             1  
-                                               aten::to         0.57%      12.032us        79.33%       1.682ms     280.390us       0.000us         0.00%      14.304us       2.384us             6  
-                                         aten::_to_copy         1.82%      38.532us        78.77%       1.670ms     278.384us       0.000us         0.00%      14.304us       2.384us             6  
-                                            aten::copy_         2.94%      62.272us        74.35%       1.577ms     262.784us      11.968us        61.72%      14.304us       2.384us             6  
-                                           aten::conv1d         0.36%       7.640us         7.60%     161.165us      53.722us       0.000us         0.00%       7.423us       2.474us             3  
-                                      aten::convolution         0.68%      14.400us         7.24%     153.525us      51.175us       0.000us         0.00%       7.423us       2.474us             3  
-                                     aten::_convolution         1.64%      34.820us         6.56%     139.125us      46.375us       0.000us         0.00%       7.423us       2.474us             3  
-                                aten::_conv_depthwise2d         1.64%      34.779us         4.03%      85.503us      28.501us       7.423us        38.28%       7.423us       2.474us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.423us        38.28%       7.423us       2.474us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.51%       6.304us       2.101us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.21%       5.664us       1.888us             3  
-                                Activity Buffer Request        68.27%       1.448ms        68.27%       1.448ms       1.448ms       2.336us        12.05%       2.336us       2.336us             1  
-                                    aten::empty_strided         2.60%      55.071us         2.60%      55.071us       9.178us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.35%      92.254us         4.35%      92.254us      10.250us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.39%      29.522us         1.76%      37.262us       4.140us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.59%      12.410us         0.59%      12.410us       0.827us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.52%      10.960us         0.52%      10.960us       3.653us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.67%      14.291us         0.67%      14.291us       4.764us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.39%       8.321us         0.47%       9.881us       3.294us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     465.824us      2410.10%     465.824us     465.824us             1  
+                                            torch_eager        10.38%     221.098us        99.69%       2.123ms       2.123ms       0.000us         0.00%      21.632us      21.632us             1  
+                                               aten::to         0.54%      11.460us        78.80%       1.678ms     279.633us       0.000us         0.00%      14.304us       2.384us             6  
+                                         aten::_to_copy         2.14%      45.672us        78.26%       1.666ms     277.723us       0.000us         0.00%      14.304us       2.384us             6  
+                                            aten::copy_         2.97%      63.201us        73.51%       1.565ms     260.883us      12.000us        62.09%      14.304us       2.384us             6  
+                                           aten::conv1d         0.45%       9.560us         8.33%     177.314us      59.105us       0.000us         0.00%       7.328us       2.443us             3  
+                                      aten::convolution         0.76%      16.270us         7.88%     167.754us      55.918us       0.000us         0.00%       7.328us       2.443us             3  
+                                     aten::_convolution         1.63%      34.781us         7.11%     151.484us      50.495us       0.000us         0.00%       7.328us       2.443us             3  
+                                aten::_conv_depthwise2d         2.18%      46.460us         4.51%      96.001us      32.000us       7.328us        37.91%       7.328us       2.443us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.91%       7.328us       2.443us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.45%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.64%       5.728us       1.909us             3  
+                                Activity Buffer Request        67.39%       1.435ms        67.39%       1.435ms       1.435ms       2.304us        11.92%       2.304us       2.304us             1  
+                                    aten::empty_strided         2.60%      55.371us         2.60%      55.371us       9.228us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.37%      93.031us         4.37%      93.031us      10.337us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.44%      30.589us         1.81%      38.620us       4.291us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.63%      13.371us         0.63%      13.371us       0.891us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      11.811us         0.55%      11.811us       3.937us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.56%      11.940us         0.56%      11.940us       3.980us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.37%       7.972us         0.46%       9.712us       3.237us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.121ms
-Self CUDA time total: 19.391us
+Self CPU time total: 2.129ms
+Self CUDA time total: 19.328us
 
 
 
@@ -4014,29 +4014,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.942us      1707.32%     334.942us     334.942us             1  
-                                            torch_eager         7.85%     148.604us        99.72%       1.887ms       1.887ms       0.000us         0.00%      21.731us      21.731us             1  
-                                               aten::to         0.32%       6.111us        83.97%       1.589ms     264.793us       0.000us         0.00%      13.731us       2.288us             6  
-                                         aten::_to_copy         1.27%      24.112us        83.64%       1.583ms     263.774us       0.000us         0.00%      13.731us       2.288us             6  
-                                            aten::copy_         2.68%      50.691us        80.81%       1.529ms     254.829us      11.618us        59.22%      13.731us       2.288us             6  
-                                           aten::conv1d         0.29%       5.540us         6.41%     121.373us      40.458us       0.000us         0.00%       8.000us       2.667us             3  
-                                      aten::convolution         0.50%       9.420us         6.12%     115.833us      38.611us       0.000us         0.00%       8.000us       2.667us             3  
-                                     aten::_convolution         1.30%      24.670us         5.62%     106.413us      35.471us       0.000us         0.00%       8.000us       2.667us             3  
-                                aten::_conv_depthwise2d         1.20%      22.792us         3.44%      65.133us      21.711us       8.000us        40.78%       8.000us       2.667us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.000us        40.78%       8.000us       2.667us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.049us        30.83%       6.049us       2.016us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.569us        28.39%       5.569us       1.856us             3  
-                                Activity Buffer Request        75.63%       1.431ms        75.63%       1.431ms       1.431ms       2.113us        10.77%       2.113us       2.113us             1  
-                                    aten::empty_strided         1.56%      29.560us         1.56%      29.560us       4.927us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.72%      70.343us         3.72%      70.343us       7.816us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.90%      17.091us         1.18%      22.301us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.48%       9.090us         0.48%       9.090us       0.606us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%       9.490us         0.50%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.52%       9.830us         0.52%       9.830us       3.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.400us         0.42%       8.020us       2.673us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.863us      1691.38%     332.863us     332.863us             1  
+                                            torch_eager         6.60%     126.115us        99.71%       1.906ms       1.906ms       0.000us         0.00%      21.792us      21.792us             1  
+                                               aten::to         0.31%       5.930us        85.54%       1.635ms     272.467us       0.000us         0.00%      13.760us       2.293us             6  
+                                         aten::_to_copy         1.30%      24.791us        85.23%       1.629ms     271.478us       0.000us         0.00%      13.760us       2.293us             6  
+                                            aten::copy_         2.71%      51.809us        82.30%       1.573ms     262.158us      11.648us        59.19%      13.760us       2.293us             6  
+                                           aten::conv1d         0.31%       5.929us         6.17%     117.852us      39.284us       0.000us         0.00%       8.032us       2.677us             3  
+                                      aten::convolution         0.53%      10.111us         5.86%     111.923us      37.308us       0.000us         0.00%       8.032us       2.677us             3  
+                                     aten::_convolution         1.20%      22.951us         5.33%     101.812us      33.937us       0.000us         0.00%       8.032us       2.677us             3  
+                                aten::_conv_depthwise2d         1.20%      22.860us         3.35%      64.021us      21.340us       8.032us        40.81%       8.032us       2.677us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.032us        40.81%       8.032us       2.677us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        30.89%       6.080us       2.027us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.29%       5.568us       1.856us             3  
+                                Activity Buffer Request        77.00%       1.472ms        77.00%       1.472ms       1.472ms       2.112us        10.73%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.63%      31.132us         1.63%      31.132us       5.189us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.70%      70.762us         3.70%      70.762us       7.862us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      16.659us         1.16%      22.190us       2.466us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       8.781us         0.46%       8.781us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      10.521us         0.55%      10.521us       3.507us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.49%       9.390us         0.49%       9.390us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.540us         0.35%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.892ms
-Self CUDA time total: 19.618us
+Self CPU time total: 1.911ms
+Self CUDA time total: 19.680us
 
 
 
@@ -4046,29 +4046,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     333.691us      1797.81%     333.691us     333.691us             1  
-                                            torch_eager         7.79%     146.606us        99.69%       1.876ms       1.876ms       0.000us         0.00%      20.481us      20.481us             1  
-                                               aten::to         0.31%       5.760us        84.09%       1.582ms     263.706us       0.000us         0.00%      13.569us       2.262us             6  
-                                         aten::_to_copy         1.25%      23.550us        83.79%       1.576ms     262.746us       0.000us         0.00%      13.569us       2.262us             6  
-                                            aten::copy_         2.67%      50.153us        80.95%       1.523ms     253.847us      11.649us        62.76%      13.569us       2.262us             6  
-                                           aten::conv1d         0.31%       5.780us         6.33%     119.033us      39.678us       0.000us         0.00%       6.912us       2.304us             3  
-                                      aten::convolution         0.52%       9.800us         6.02%     113.253us      37.751us       0.000us         0.00%       6.912us       2.304us             3  
-                                     aten::_convolution         1.28%      24.000us         5.50%     103.453us      34.484us       0.000us         0.00%       6.912us       2.304us             3  
-                                aten::_conv_depthwise2d         1.15%      21.640us         3.37%      63.473us      21.158us       6.912us        37.24%       6.912us       2.304us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.912us        37.24%       6.912us       2.304us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.953us        32.07%       5.953us       1.984us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.69%       5.696us       1.899us             3  
-                                Activity Buffer Request        75.77%       1.426ms        75.77%       1.426ms       1.426ms       1.920us        10.34%       1.920us       1.920us             1  
-                                    aten::empty_strided         1.59%      29.840us         1.59%      29.840us       4.973us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.79%      71.241us         3.79%      71.241us       7.916us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      17.220us         1.19%      22.362us       2.485us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       8.782us         0.47%       8.782us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%       9.312us         0.49%       9.312us       3.104us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       8.581us         0.46%       8.581us       2.860us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.33%       6.290us         0.41%       7.740us       2.580us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     355.199us      1913.89%     355.199us     355.199us             1  
+                                            torch_eager         6.67%     125.171us        99.71%       1.872ms       1.872ms       0.000us         0.00%      20.511us      20.511us             1  
+                                               aten::to         0.32%       6.091us        84.23%       1.581ms     263.570us       0.000us         0.00%      13.600us       2.267us             6  
+                                         aten::_to_copy         1.32%      24.859us        83.90%       1.575ms     262.555us       0.000us         0.00%      13.600us       2.267us             6  
+                                            aten::copy_         2.70%      50.760us        80.88%       1.518ms     253.083us      11.648us        62.76%      13.600us       2.267us             6  
+                                           aten::conv1d         0.30%       5.670us         7.37%     138.423us      46.141us       0.000us         0.00%       6.911us       2.304us             3  
+                                      aten::convolution         0.52%       9.720us         7.07%     132.753us      44.251us       0.000us         0.00%       6.911us       2.304us             3  
+                                     aten::_convolution         1.24%      23.210us         6.55%     123.033us      41.011us       0.000us         0.00%       6.911us       2.304us             3  
+                                aten::_conv_depthwise2d         1.26%      23.712us         4.48%      84.033us      28.011us       6.911us        37.24%       6.911us       2.304us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.911us        37.24%       6.911us       2.304us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        32.24%       5.984us       1.995us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        30.52%       5.664us       1.888us             3  
+                                Activity Buffer Request        75.59%       1.419ms        75.59%       1.419ms       1.419ms       1.952us        10.52%       1.952us       1.952us             1  
+                                    aten::empty_strided         1.70%      31.973us         1.70%      31.973us       5.329us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.83%      72.002us         3.83%      72.002us       8.000us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      16.661us         1.15%      21.682us       2.409us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%       8.941us         0.48%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.49%      28.041us         1.49%      28.041us       9.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       8.840us         0.47%       8.840us       2.947us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       5.960us         0.40%       7.470us       2.490us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.882ms
-Self CUDA time total: 18.561us
+Self CPU time total: 1.878ms
+Self CUDA time total: 18.559us
 
 
 
@@ -4078,29 +4078,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.628us      1741.67%     341.628us     341.628us             1  
-                                            torch_eager         6.79%     135.276us        99.76%       1.989ms       1.989ms       0.000us         0.00%      21.759us      21.759us             1  
-                                               aten::to         0.31%       6.091us        85.44%       1.703ms     283.911us       0.000us         0.00%      14.111us       2.352us             6  
-                                         aten::_to_copy         1.20%      23.892us        85.13%       1.697ms     282.896us       0.000us         0.00%      14.111us       2.352us             6  
-                                            aten::copy_         2.47%      49.180us        82.37%       1.642ms     273.716us      11.967us        61.01%      14.111us       2.352us             6  
-                                           aten::conv1d         0.29%       5.740us         6.09%     121.414us      40.471us       0.000us         0.00%       7.648us       2.549us             3  
-                                      aten::convolution         0.55%      11.061us         5.80%     115.674us      38.558us       0.000us         0.00%       7.648us       2.549us             3  
-                                     aten::_convolution         1.19%      23.780us         5.25%     104.613us      34.871us       0.000us         0.00%       7.648us       2.549us             3  
-                                aten::_conv_depthwise2d         1.14%      22.750us         3.26%      64.953us      21.651us       7.648us        38.99%       7.648us       2.549us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.648us        38.99%       7.648us       2.549us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.175us        31.48%       6.175us       2.058us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.792us        29.53%       5.792us       1.931us             3  
-                                Activity Buffer Request        68.82%       1.372ms        68.82%       1.372ms       1.372ms       2.144us        10.93%       2.144us       2.144us             1  
-                                    aten::empty_strided         1.56%      31.190us         1.56%      31.190us       5.198us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.22%     243.619us        12.22%     243.619us      27.069us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.88%      17.629us         1.14%      22.660us       2.518us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.782us         0.44%       8.782us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.630us         0.48%       9.630us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%       9.941us         0.50%       9.941us       3.314us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.720us         0.41%       8.110us       2.703us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.841us      1714.87%     335.841us     335.841us             1  
+                                            torch_eager         6.09%     125.084us        99.75%       2.047ms       2.047ms       0.000us         0.00%      21.728us      21.728us             1  
+                                               aten::to         0.29%       6.012us        86.59%       1.777ms     296.210us       0.000us         0.00%      14.049us       2.341us             6  
+                                         aten::_to_copy         1.18%      24.318us        86.30%       1.771ms     295.209us       0.000us         0.00%      14.049us       2.341us             6  
+                                            aten::copy_         2.44%      50.170us        83.64%       1.717ms     286.105us      11.905us        60.79%      14.049us       2.341us             6  
+                                           aten::conv1d         0.29%       5.981us         5.73%     117.633us      39.211us       0.000us         0.00%       7.679us       2.560us             3  
+                                      aten::convolution         0.48%       9.909us         5.44%     111.652us      37.217us       0.000us         0.00%       7.679us       2.560us             3  
+                                     aten::_convolution         1.11%      22.712us         4.96%     101.743us      33.914us       0.000us         0.00%       7.679us       2.560us             3  
+                                aten::_conv_depthwise2d         1.08%      22.231us         3.11%      63.781us      21.260us       7.679us        39.21%       7.679us       2.560us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us        39.21%       7.679us       2.560us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.54%       6.176us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.729us        29.25%       5.729us       1.910us             3  
+                                Activity Buffer Request        70.17%       1.440ms        70.17%       1.440ms       1.440ms       2.144us        10.95%       2.144us       2.144us             1  
+                                    aten::empty_strided         1.48%      30.301us         1.48%      30.301us       5.050us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.02%     246.676us        12.02%     246.676us      27.408us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.85%      17.450us         1.12%      22.930us       2.548us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.940us         0.44%       8.940us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.630us         0.47%       9.630us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.56%      11.490us         0.56%      11.490us       3.830us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.710us         0.34%       6.930us       2.310us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.994ms
-Self CUDA time total: 19.615us
+Self CPU time total: 2.053ms
+Self CUDA time total: 19.584us
 
 
 
@@ -4110,29 +4110,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.213us      1403.01%     341.213us     341.213us             1  
-                                            torch_eager         7.36%     148.867us        99.73%       2.016ms       2.016ms       0.000us         0.00%      26.560us      26.560us             1  
-                                               aten::to         0.30%       6.030us        84.88%       1.716ms     285.962us       0.000us         0.00%      15.168us       2.528us             6  
-                                         aten::_to_copy         1.20%      24.229us        84.58%       1.710ms     284.956us       0.000us         0.00%      15.168us       2.528us             6  
-                                            aten::copy_         2.44%      49.414us        81.85%       1.655ms     275.782us      12.928us        53.16%      15.168us       2.528us             6  
-                                           aten::conv1d         0.28%       5.730us         5.99%     121.174us      40.391us       0.000us         0.00%      11.392us       3.797us             3  
-                                      aten::convolution         0.47%       9.480us         5.71%     115.444us      38.481us       0.000us         0.00%      11.392us       3.797us             3  
-                                     aten::_convolution         1.14%      23.073us         5.24%     105.964us      35.321us       0.000us         0.00%      11.392us       3.797us             3  
-                                aten::_conv_depthwise2d         1.05%      21.189us         3.24%      65.411us      21.804us      11.392us        46.84%      11.392us       3.797us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.392us        46.84%      11.392us       3.797us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        27.11%       6.592us       2.197us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        26.05%       6.336us       2.112us             3  
-                                Activity Buffer Request        70.12%       1.417ms        70.12%       1.417ms       1.417ms       2.240us         9.21%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.52%      30.820us         1.52%      30.820us       5.137us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.45%     211.347us        10.45%     211.347us      23.483us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.95%      19.208us         1.23%      24.829us       2.759us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.241us         0.46%       9.241us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.482us         0.47%       9.482us       3.161us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.55%      11.190us         0.55%      11.190us       3.730us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.961us         0.41%       8.361us       2.787us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     329.565us      1339.31%     329.565us     329.565us             1  
+                                            torch_eager         6.13%     122.184us        99.75%       1.990ms       1.990ms       0.000us         0.00%      26.911us      26.911us             1  
+                                               aten::to         0.30%       5.979us        86.40%       1.724ms     287.259us       0.000us         0.00%      15.359us       2.560us             6  
+                                         aten::_to_copy         1.37%      27.300us        86.10%       1.718ms     286.262us       0.000us         0.00%      15.359us       2.560us             6  
+                                            aten::copy_         2.45%      48.801us        83.22%       1.660ms     276.655us      13.055us        53.05%      15.359us       2.560us             6  
+                                           aten::conv1d         0.29%       5.841us         5.86%     116.932us      38.977us       0.000us         0.00%      11.552us       3.851us             3  
+                                      aten::convolution         0.50%       9.929us         5.57%     111.091us      37.030us       0.000us         0.00%      11.552us       3.851us             3  
+                                     aten::_convolution         1.16%      23.192us         5.07%     101.162us      33.721us       0.000us         0.00%      11.552us       3.851us             3  
+                                aten::_conv_depthwise2d         1.12%      22.341us         3.11%      62.030us      20.677us      11.552us        46.95%      11.552us       3.851us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        46.95%      11.552us       3.851us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.688us        27.18%       6.688us       2.229us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.367us        25.87%       6.367us       2.122us             3  
+                                Activity Buffer Request        71.71%       1.430ms        71.71%       1.430ms       1.430ms       2.304us         9.36%       2.304us       2.304us             1  
+                                    aten::empty_strided         1.52%      30.342us         1.52%      30.342us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.06%     200.744us        10.06%     200.744us      22.305us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.86%      17.251us         1.14%      22.681us       2.520us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.051us         0.45%       9.051us       0.603us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.579us         0.48%       9.579us       3.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.50%      10.050us         0.50%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.019us         0.36%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.022ms
-Self CUDA time total: 24.320us
+Self CPU time total: 1.995ms
+Self CUDA time total: 24.607us
 
 
 
@@ -4142,29 +4142,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.330us      1285.14%     334.330us     334.330us             1  
-                                            torch_eager         7.10%     143.875us        99.74%       2.020ms       2.020ms       0.000us         0.00%      28.255us      28.255us             1  
-                                               aten::to         0.28%       5.680us        85.25%       1.727ms     287.810us       0.000us         0.00%      15.232us       2.539us             6  
-                                         aten::_to_copy         1.18%      23.873us        84.97%       1.721ms     286.863us       0.000us         0.00%      15.232us       2.539us             6  
-                                            aten::copy_         2.45%      49.640us        82.36%       1.668ms     278.038us      12.992us        49.94%      15.232us       2.539us             6  
-                                           aten::conv1d         0.29%       5.889us         5.94%     120.414us      40.138us       0.000us         0.00%      13.023us       4.341us             3  
-                                      aten::convolution         0.46%       9.401us         5.65%     114.525us      38.175us       0.000us         0.00%      13.023us       4.341us             3  
-                                     aten::_convolution         1.22%      24.611us         5.19%     105.124us      35.041us       0.000us         0.00%      13.023us       4.341us             3  
-                                aten::_conv_depthwise2d         1.06%      21.480us         3.19%      64.562us      21.521us      13.023us        50.06%      13.023us       4.341us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.023us        50.06%      13.023us       4.341us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us        25.46%       6.624us       2.208us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.48%       6.368us       2.123us             3  
-                                Activity Buffer Request        71.17%       1.442ms        71.17%       1.442ms       1.442ms       2.240us         8.61%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.44%      29.082us         1.44%      29.082us       4.847us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.85%     199.548us         9.85%     199.548us      22.172us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.91%      18.470us         1.17%      23.650us       2.628us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.970us         0.44%       8.970us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      10.400us         0.51%      10.400us       3.467us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%      10.200us         0.50%      10.200us       3.400us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.091us         0.38%       7.621us       2.540us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.812us      1379.20%     358.812us     358.812us             1  
+                                            torch_eager         6.94%     139.423us        99.75%       2.005ms       2.005ms       0.000us         0.00%      28.256us      28.256us             1  
+                                               aten::to         0.33%       6.550us        85.45%       1.717ms     286.205us       0.000us         0.00%      15.199us       2.533us             6  
+                                         aten::_to_copy         1.20%      24.182us        85.13%       1.711ms     285.114us       0.000us         0.00%      15.199us       2.533us             6  
+                                            aten::copy_         2.59%      52.130us        82.30%       1.654ms     275.648us      12.959us        49.81%      15.199us       2.533us             6  
+                                           aten::conv1d         0.30%       6.120us         5.97%     119.993us      39.998us       0.000us         0.00%      13.057us       4.352us             3  
+                                      aten::convolution         0.48%       9.660us         5.67%     113.873us      37.958us       0.000us         0.00%      13.057us       4.352us             3  
+                                     aten::_convolution         1.13%      22.802us         5.19%     104.213us      34.738us       0.000us         0.00%      13.057us       4.352us             3  
+                                aten::_conv_depthwise2d         1.09%      21.932us         3.25%      65.242us      21.747us      13.057us        50.19%      13.057us       4.352us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.057us        50.19%      13.057us       4.352us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.623us        25.46%       6.623us       2.208us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.35%       6.336us       2.112us             3  
+                                Activity Buffer Request        70.68%       1.420ms        70.68%       1.420ms       1.420ms       2.240us         8.61%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.62%      32.611us         1.62%      32.611us       5.435us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.17%     204.364us        10.17%     204.364us      22.707us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.647us         1.15%      23.189us       2.577us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.382us         0.47%       9.382us       0.625us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.58%      11.651us         0.58%      11.651us       3.884us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       8.769us         0.44%       8.769us       2.923us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.420us         0.39%       7.890us       2.630us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.026ms
-Self CUDA time total: 26.015us
+Self CPU time total: 2.010ms
+Self CUDA time total: 26.016us
 
 
 
@@ -4174,29 +4174,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.315us       888.50%     340.315us     340.315us             1  
-                                            torch_eager         7.29%     147.016us        99.74%       2.012ms       2.012ms       0.000us         0.00%      40.894us      40.894us             1  
-                                           aten::conv1d         0.29%       5.920us         5.91%     119.264us      39.755us       0.000us         0.00%      22.496us       7.499us             3  
-                                      aten::convolution         0.47%       9.411us         5.62%     113.344us      37.781us       0.000us         0.00%      22.496us       7.499us             3  
-                                     aten::_convolution         1.19%      23.960us         5.15%     103.933us      34.644us       0.000us         0.00%      22.496us       7.499us             3  
-                                aten::_conv_depthwise2d         1.11%      22.310us         3.18%      64.143us      21.381us      22.496us        58.73%      22.496us       7.499us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.496us        58.73%      22.496us       7.499us             3  
-                                               aten::to         0.29%       5.851us        85.12%       1.717ms     286.238us       0.000us         0.00%      18.398us       3.066us             6  
-                                         aten::_to_copy         1.17%      23.549us        84.83%       1.712ms     285.263us       0.000us         0.00%      18.398us       3.066us             6  
-                                            aten::copy_         2.43%      48.960us        82.11%       1.657ms     276.121us      15.806us        41.27%      18.398us       3.066us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.97%       8.416us       2.805us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.390us        19.29%       7.390us       2.463us             3  
-                                Activity Buffer Request        70.87%       1.430ms        70.87%       1.430ms       1.430ms       2.592us         6.77%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.55%      31.301us         1.55%      31.301us       5.217us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.91%     199.938us         9.91%     199.938us      22.215us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      17.540us         1.13%      22.711us       2.523us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.912us         0.44%       8.912us       0.594us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.390us         0.47%       9.390us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%      10.361us         0.51%      10.361us       3.454us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.100us         0.37%       7.550us       2.517us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.896us       853.65%     328.896us     328.896us             1  
+                                            torch_eager         6.29%     121.493us        99.73%       1.928ms       1.928ms       0.000us         0.00%      41.088us      41.088us             1  
+                                           aten::conv1d         0.31%       5.961us         6.00%     115.903us      38.634us       0.000us         0.00%      22.688us       7.563us             3  
+                                      aten::convolution         0.50%       9.600us         5.69%     109.942us      36.647us       0.000us         0.00%      22.688us       7.563us             3  
+                                     aten::_convolution         1.16%      22.510us         5.19%     100.342us      33.447us       0.000us         0.00%      22.688us       7.563us             3  
+                                aten::_conv_depthwise2d         1.17%      22.551us         3.25%      62.881us      20.960us      22.688us        58.89%      22.688us       7.563us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.688us        58.89%      22.688us       7.563us             3  
+                                               aten::to         0.33%       6.421us        86.08%       1.664ms     277.308us       0.000us         0.00%      18.400us       3.067us             6  
+                                         aten::_to_copy         1.25%      24.161us        85.75%       1.657ms     276.238us       0.000us         0.00%      18.400us       3.067us             6  
+                                            aten::copy_         2.57%      49.759us        82.93%       1.603ms     267.166us      15.840us        41.11%      18.400us       3.067us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        21.93%       8.448us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        19.19%       7.392us       2.464us             3  
+                                Activity Buffer Request        71.07%       1.374ms        71.07%       1.374ms       1.374ms       2.560us         6.64%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.57%      30.271us         1.57%      30.271us       5.045us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.43%     201.525us        10.43%     201.525us      22.392us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.86%      16.701us         1.14%      22.001us       2.445us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       8.751us         0.45%       8.751us       0.583us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.290us         0.48%       9.290us       3.097us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.060us         0.47%       9.060us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.459us         0.35%       6.690us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.018ms
-Self CUDA time total: 38.302us
+Self CPU time total: 1.933ms
+Self CUDA time total: 38.528us
 
 
 
@@ -4206,29 +4206,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     363.388us       882.35%     363.388us     363.388us             1  
-                                            torch_eager         8.20%     165.958us        99.73%       2.020ms       2.020ms       0.000us         0.00%      43.808us      43.808us             1  
-                                           aten::conv1d         0.32%       6.510us         6.06%     122.733us      40.911us       0.000us         0.00%      25.408us       8.469us             3  
-                                      aten::convolution         0.48%       9.730us         5.74%     116.223us      38.741us       0.000us         0.00%      25.408us       8.469us             3  
-                                     aten::_convolution         1.17%      23.611us         5.26%     106.493us      35.498us       0.000us         0.00%      25.408us       8.469us             3  
-                                aten::_conv_depthwise2d         1.11%      22.549us         3.28%      66.422us      22.141us      25.408us        61.69%      25.408us       8.469us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.408us        61.69%      25.408us       8.469us             3  
-                                               aten::to         0.31%       6.220us        83.98%       1.701ms     283.450us       0.000us         0.00%      18.400us       3.067us             6  
-                                         aten::_to_copy         1.16%      23.591us        83.68%       1.694ms     282.413us       0.000us         0.00%      18.400us       3.067us             6  
-                                            aten::copy_         2.51%      50.781us        81.00%       1.640ms     273.388us      15.776us        38.31%      18.400us       3.067us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.352us        20.28%       8.352us       2.784us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        18.03%       7.424us       2.475us             3  
-                                Activity Buffer Request        69.68%       1.411ms        69.68%       1.411ms       1.411ms       2.624us         6.37%       2.624us       2.624us             1  
-                                    aten::empty_strided         1.51%      30.560us         1.51%      30.560us       5.093us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.99%     202.397us         9.99%     202.397us      22.489us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.88%      17.759us         1.14%      23.000us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.250us         0.46%       9.250us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      10.382us         0.51%      10.382us       3.461us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.48%       9.651us         0.48%       9.651us       3.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.33%       6.630us         0.40%       8.160us       2.720us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.458us       810.83%     334.458us     334.458us             1  
+                                            torch_eager         6.32%     125.394us        99.75%       1.978ms       1.978ms       0.000us         0.00%      43.841us      43.841us             1  
+                                           aten::conv1d         0.30%       5.899us         5.88%     116.562us      38.854us       0.000us         0.00%      25.600us       8.533us             3  
+                                      aten::convolution         0.49%       9.810us         5.58%     110.663us      36.888us       0.000us         0.00%      25.600us       8.533us             3  
+                                     aten::_convolution         1.13%      22.411us         5.09%     100.853us      33.618us       0.000us         0.00%      25.600us       8.533us             3  
+                                aten::_conv_depthwise2d         1.14%      22.520us         3.20%      63.392us      21.131us      25.600us        62.06%      25.600us       8.533us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.600us        62.06%      25.600us       8.533us             3  
+                                               aten::to         0.30%       5.959us        86.14%       1.708ms     284.675us       0.000us         0.00%      18.241us       3.040us             6  
+                                         aten::_to_copy         1.33%      26.372us        85.84%       1.702ms     283.682us       0.000us         0.00%      18.241us       3.040us             6  
+                                            aten::copy_         2.49%      49.420us        83.02%       1.646ms     274.363us      15.649us        37.94%      18.241us       3.040us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.321us        20.17%       8.321us       2.774us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        17.77%       7.328us       2.443us             3  
+                                Activity Buffer Request        71.51%       1.418ms        71.51%       1.418ms       1.418ms       2.592us         6.28%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.49%      29.540us         1.49%      29.540us       4.923us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.06%     199.427us        10.06%     199.427us      22.159us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      18.199us         1.18%      23.330us       2.592us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.651us         0.44%       8.651us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.54%      10.640us         0.54%      10.640us       3.547us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.48%       9.610us         0.48%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.590us         0.34%       6.770us       2.257us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.025ms
-Self CUDA time total: 41.184us
+Self CPU time total: 1.983ms
+Self CUDA time total: 41.249us
 
 
 
@@ -4238,29 +4238,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     352.830us       343.38%     352.830us     352.830us             1  
-                                            torch_eager         7.15%     144.983us        99.76%       2.023ms       2.023ms       0.000us         0.00%     108.768us     108.768us             1  
-                                           aten::conv1d         0.29%       5.781us         5.92%     120.074us      40.025us       0.000us         0.00%      70.432us      23.477us             3  
-                                      aten::convolution         0.47%       9.599us         5.64%     114.293us      38.098us       0.000us         0.00%      70.432us      23.477us             3  
-                                     aten::_convolution         1.14%      23.149us         5.16%     104.694us      34.898us       0.000us         0.00%      70.432us      23.477us             3  
-                                aten::_conv_depthwise2d         1.16%      23.581us         3.22%      65.212us      21.737us      70.432us        68.55%      70.432us      23.477us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.432us        68.55%      70.432us      23.477us             3  
-                                               aten::to         0.30%       6.111us        85.26%       1.729ms     288.085us       0.000us         0.00%      38.336us       6.389us             6  
-                                         aten::_to_copy         1.62%      32.820us        84.95%       1.722ms     287.067us       0.000us         0.00%      38.336us       6.389us             6  
-                                            aten::copy_         2.46%      49.781us        81.90%       1.660ms     276.745us      32.320us        31.45%      38.336us       6.389us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        17.22%      17.696us       5.899us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.624us        14.23%      14.624us       4.875us             3  
-                                Activity Buffer Request        70.70%       1.433ms        70.70%       1.433ms       1.433ms       6.016us         5.85%       6.016us       6.016us             1  
-                                    aten::empty_strided         1.44%      29.111us         1.44%      29.111us       4.852us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.89%     200.449us         9.89%     200.449us      22.272us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.88%      17.943us         1.16%      23.512us       2.612us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.330us         0.46%       9.330us       0.622us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.471us         0.47%       9.471us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.45%       9.050us         0.45%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       6.391us         0.39%       7.911us       2.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.849us       326.92%     338.849us     338.849us             1  
+                                            torch_eager         5.95%     117.585us        99.74%       1.970ms       1.970ms       0.000us         0.00%     109.697us     109.697us             1  
+                                           aten::conv1d         0.30%       5.970us         6.05%     119.502us      39.834us       0.000us         0.00%      71.232us      23.744us             3  
+                                      aten::convolution         0.49%       9.700us         5.75%     113.532us      37.844us       0.000us         0.00%      71.232us      23.744us             3  
+                                     aten::_convolution         1.15%      22.781us         5.26%     103.832us      34.611us       0.000us         0.00%      71.232us      23.744us             3  
+                                aten::_conv_depthwise2d         1.18%      23.259us         3.31%      65.420us      21.807us      71.232us        68.72%      71.232us      23.744us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      71.232us        68.72%      71.232us      23.744us             3  
+                                               aten::to         0.31%       6.199us        86.38%       1.706ms     284.313us       0.000us         0.00%      38.465us       6.411us             6  
+                                         aten::_to_copy         1.31%      25.891us        86.06%       1.700ms     283.280us       0.000us         0.00%      38.465us       6.411us             6  
+                                            aten::copy_         2.57%      50.812us        83.17%       1.643ms     273.758us      32.417us        31.28%      38.465us       6.411us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.760us        17.13%      17.760us       5.920us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.657us        14.14%      14.657us       4.886us             3  
+                                Activity Buffer Request        71.61%       1.414ms        71.61%       1.414ms       1.414ms       6.048us         5.84%       6.048us       6.048us             1  
+                                    aten::empty_strided         1.58%      31.240us         1.58%      31.240us       5.207us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.13%     200.155us        10.13%     200.155us      22.239us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      17.181us         1.15%      22.621us       2.513us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       8.941us         0.45%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%      10.050us         0.51%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.370us         0.47%       9.370us       3.123us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.551us         0.35%       6.851us       2.284us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.027ms
-Self CUDA time total: 102.752us
+Self CPU time total: 1.975ms
+Self CUDA time total: 103.649us
 
 
 
@@ -4270,29 +4270,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.363us       292.97%     330.363us     330.363us             1  
-                                            torch_eager        16.10%     118.634us        99.31%     731.955us     731.955us       0.000us         0.00%     118.781us     118.781us             1  
-                                           aten::conv1d         0.80%       5.881us        15.92%     117.344us      39.115us       0.000us         0.00%      80.541us      26.847us             3  
-                                      aten::convolution         1.32%       9.760us        15.12%     111.463us      37.154us       0.000us         0.00%      80.541us      26.847us             3  
-                                     aten::_convolution         3.06%      22.540us        13.80%     101.703us      33.901us       0.000us         0.00%      80.541us      26.847us             3  
-                                aten::_conv_depthwise2d         2.83%      20.841us         8.49%      62.593us      20.864us      80.541us        71.42%      80.541us      26.847us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.541us        71.42%      80.541us      26.847us             3  
-                                               aten::to         0.79%       5.790us        63.53%     468.255us      78.043us       0.000us         0.00%      38.240us       6.373us             6  
-                                         aten::_to_copy         3.21%      23.660us        62.75%     462.465us      77.078us       0.000us         0.00%      38.240us       6.373us             6  
-                                            aten::copy_         6.76%      49.831us        55.55%     409.415us      68.236us      32.224us        28.58%      38.240us       6.373us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.728us        15.72%      17.728us       5.909us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.496us        12.86%      14.496us       4.832us             3  
-                                Activity Buffer Request        25.24%     185.996us        25.24%     185.996us     185.996us       6.016us         5.33%       6.016us       6.016us             1  
-                                    aten::empty_strided         3.99%      29.390us         3.99%      29.390us       4.898us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.60%     196.028us        26.60%     196.028us      21.781us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.44%      17.960us         3.11%      22.951us       2.550us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.15%       8.461us         1.15%       8.461us       0.564us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.35%       9.931us         1.35%       9.931us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.27%       9.381us         1.27%       9.381us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.87%       6.430us         1.06%       7.840us       2.613us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.597us       314.53%     357.597us     357.597us             1  
+                                            torch_eager         6.01%     120.196us        99.73%       1.995ms       1.995ms       0.000us         0.00%     119.645us     119.645us             1  
+                                           aten::conv1d         0.28%       5.578us         6.85%     137.112us      45.704us       0.000us         0.00%      81.344us      27.115us             3  
+                                      aten::convolution         0.47%       9.452us         6.58%     131.534us      43.845us       0.000us         0.00%      81.344us      27.115us             3  
+                                     aten::_convolution         1.16%      23.298us         6.10%     122.082us      40.694us       0.000us         0.00%      81.344us      27.115us             3  
+                                aten::_conv_depthwise2d         1.16%      23.221us         4.15%      82.932us      27.644us      81.344us        71.55%      81.344us      27.115us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      81.344us        71.55%      81.344us      27.115us             3  
+                                               aten::to         0.33%       6.509us        85.46%       1.710ms     284.935us       0.000us         0.00%      38.301us       6.383us             6  
+                                         aten::_to_copy         1.29%      25.870us        85.14%       1.703ms     283.850us       0.000us         0.00%      38.301us       6.383us             6  
+                                            aten::copy_         2.58%      51.531us        82.27%       1.646ms     274.308us      32.350us        28.45%      38.301us       6.383us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.727us        15.59%      17.727us       5.909us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.623us        12.86%      14.623us       4.874us             3  
+                                Activity Buffer Request        70.95%       1.419ms        70.95%       1.419ms       1.419ms       5.951us         5.23%       5.951us       5.951us             1  
+                                    aten::empty_strided         1.57%      31.380us         1.57%      31.380us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.95%     199.044us         9.95%     199.044us      22.116us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      17.740us         1.16%      23.191us       2.577us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.433us         0.47%       9.433us       0.629us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      10.531us         0.53%      10.531us       3.510us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.26%      25.130us         1.26%      25.130us       8.377us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.010us         0.38%       7.612us       2.537us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 737.005us
-Self CUDA time total: 112.765us
+Self CPU time total: 2.000ms
+Self CUDA time total: 113.694us
 
 
 
@@ -4302,29 +4302,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        22.21%     170.695us        99.32%     763.366us     763.366us       0.000us         0.00%     430.770us     430.770us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     416.723us       106.46%     416.723us     416.723us             1  
-                                           aten::conv1d         0.77%       5.951us        14.86%     114.225us      38.075us       0.000us         0.00%     251.288us      83.763us             3  
-                                      aten::convolution         1.24%       9.541us        14.09%     108.274us      36.091us       0.000us         0.00%     251.288us      83.763us             3  
-                                     aten::_convolution         2.83%      21.719us        12.85%      98.733us      32.911us       0.000us         0.00%     251.288us      83.763us             3  
-                                aten::_conv_depthwise2d         2.74%      21.061us         7.99%      61.422us      20.474us     251.288us        64.20%     251.288us      83.763us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.288us        64.20%     251.288us      83.763us             3  
-                                               aten::to         0.75%       5.750us        58.89%     452.676us      75.446us       0.000us         0.00%     179.482us      29.914us             6  
-                                         aten::_to_copy         3.02%      23.182us        58.15%     446.926us      74.488us       0.000us         0.00%     179.482us      29.914us             6  
-                                            aten::copy_         6.40%      49.211us        51.58%     396.473us      66.079us     140.155us        35.80%     179.482us      29.914us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     100.254us        25.61%     100.254us      33.418us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.901us        10.19%      39.901us      13.300us             3  
-                                Activity Buffer Request        22.72%     174.636us        22.72%     174.636us     174.636us      39.327us        10.05%      39.327us      39.327us             1  
-                                    aten::empty_strided         3.55%      27.271us         3.55%      27.271us       4.545us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.36%     194.936us        25.36%     194.936us      21.660us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.13%      16.381us         2.81%      21.611us       2.401us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.16%       8.880us         1.16%       8.880us       0.592us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.18%       9.091us         1.18%       9.091us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%       8.960us         1.17%       8.960us       2.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       5.770us         0.94%       7.191us       2.397us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         5.97%     120.782us        97.66%       1.975ms       1.975ms       0.000us         0.00%     434.301us     434.301us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     421.021us       106.85%     421.021us     421.021us             1  
+                                           aten::conv1d         0.30%       6.069us         5.79%     117.202us      39.067us       0.000us         0.00%     251.007us      83.669us             3  
+                                      aten::convolution         0.47%       9.471us         5.49%     111.133us      37.044us       0.000us         0.00%     251.007us      83.669us             3  
+                                     aten::_convolution         1.10%      22.180us         5.03%     101.662us      33.887us       0.000us         0.00%     251.007us      83.669us             3  
+                                aten::_conv_depthwise2d         1.13%      22.779us         3.17%      64.182us      21.394us     251.007us        63.71%     251.007us      83.669us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.007us        63.71%     251.007us      83.669us             3  
+                                               aten::to         0.31%       6.200us        84.52%       1.710ms     284.917us       0.000us         0.00%     183.294us      30.549us             6  
+                                         aten::_to_copy         1.19%      24.072us        84.22%       1.703ms     283.884us       0.000us         0.00%     183.294us      30.549us             6  
+                                            aten::copy_         2.45%      49.593us        81.56%       1.650ms     274.942us     143.007us        36.29%     183.294us      30.549us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.495us        26.01%     102.495us      34.165us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.512us        10.28%      40.512us      13.504us             3  
+                                Activity Buffer Request        70.36%       1.423ms        70.36%       1.423ms       1.423ms      40.287us        10.22%      40.287us      40.287us             1  
+                                    aten::empty_strided         1.46%      29.579us         1.46%      29.579us       4.930us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.86%     199.474us         9.86%     199.474us      22.164us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.84%      17.021us         1.11%      22.432us       2.492us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.090us         0.45%       9.090us       0.606us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.720us         0.48%       9.720us       3.240us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.202us         0.45%       9.202us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.680us         0.35%       7.060us       2.353us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 768.616us
-Self CUDA time total: 391.443us
+Self CPU time total: 2.023ms
+Self CUDA time total: 394.014us
 
 
 
@@ -4334,29 +4334,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        13.26%     117.114us        87.73%     774.557us     774.557us       0.000us         0.00%     486.014us     486.014us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     473.342us       105.98%     473.342us     473.342us             1  
-                                           aten::conv1d         0.63%       5.520us        13.02%     114.943us      38.314us       0.000us         0.00%     298.622us      99.541us             3  
-                                      aten::convolution         1.08%       9.570us        12.39%     109.423us      36.474us       0.000us         0.00%     298.622us      99.541us             3  
-                                     aten::_convolution         2.49%      22.001us        11.31%      99.853us      33.284us       0.000us         0.00%     298.622us      99.541us             3  
-                                aten::_conv_depthwise2d         2.40%      21.190us         7.05%      62.252us      20.751us     298.622us        66.86%     298.622us      99.541us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.622us        66.86%     298.622us      99.541us             3  
-                                               aten::to         0.65%       5.781us        58.29%     514.667us      85.778us       0.000us         0.00%     187.392us      31.232us             6  
-                                         aten::_to_copy         2.57%      22.699us        57.64%     508.886us      84.814us       0.000us         0.00%     187.392us      31.232us             6  
-                                            aten::copy_         5.62%      49.650us        51.80%     457.366us      76.228us     148.032us        33.14%     187.392us      31.232us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.256us        24.24%     108.256us      36.085us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.776us         8.91%      39.776us      13.259us             3  
-                                Activity Buffer Request        26.78%     236.449us        26.78%     236.449us     236.449us      39.360us         8.81%      39.360us      39.360us             1  
-                                    aten::empty_strided         3.26%      28.821us         3.26%      28.821us       4.804us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        22.01%     194.327us        22.01%     194.327us      21.592us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.00%      17.701us         2.60%      22.912us       2.546us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.01%       8.901us         1.01%       8.901us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.05%       9.311us         1.05%       9.311us       3.104us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.98%       8.691us         0.98%       8.691us       2.897us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.65%       5.750us         0.82%       7.230us       2.410us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         5.89%     122.072us        95.29%       1.975ms       1.975ms       0.000us         0.00%     486.458us     486.458us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     474.010us       106.16%     474.010us     474.010us             1  
+                                           aten::conv1d         0.28%       5.830us         5.59%     115.853us      38.618us       0.000us         0.00%     299.291us      99.764us             3  
+                                      aten::convolution         0.46%       9.610us         5.31%     110.023us      36.674us       0.000us         0.00%     299.291us      99.764us             3  
+                                     aten::_convolution         1.08%      22.439us         4.85%     100.413us      33.471us       0.000us         0.00%     299.291us      99.764us             3  
+                                aten::_conv_depthwise2d         1.04%      21.490us         3.04%      62.983us      20.994us     299.291us        67.03%     299.291us      99.764us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     299.291us        67.03%     299.291us      99.764us             3  
+                                               aten::to         0.31%       6.341us        82.51%       1.710ms     284.962us       0.000us         0.00%     187.167us      31.195us             6  
+                                         aten::_to_copy         1.23%      25.592us        82.20%       1.703ms     283.906us       0.000us         0.00%     187.167us      31.195us             6  
+                                            aten::copy_         2.39%      49.481us        79.48%       1.647ms     274.512us     147.199us        32.97%     187.167us      31.195us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     106.911us        23.94%     106.911us      35.637us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.288us         9.02%      40.288us      13.429us             3  
+                                Activity Buffer Request        68.62%       1.422ms        68.62%       1.422ms       1.422ms      39.968us         8.95%      39.968us      39.968us             1  
+                                    aten::empty_strided         1.48%      30.770us         1.48%      30.770us       5.128us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.53%     197.485us         9.53%     197.485us      21.943us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.81%      16.791us         1.08%      22.301us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       9.141us         0.44%       9.141us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.701us         0.47%       9.701us       3.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.48%       9.941us         0.48%       9.941us       3.314us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       5.510us         0.33%       6.790us       2.263us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 882.890us
-Self CUDA time total: 446.654us
+Self CPU time total: 2.072ms
+Self CUDA time total: 446.490us
 
 
 
@@ -4366,29 +4366,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.122us      1734.57%     324.122us     324.122us             1  
-                                            torch_eager        15.60%     121.627us        99.38%     775.067us     775.067us       0.000us         0.00%      20.574us      20.574us             1  
-                                               aten::to         0.72%       5.589us        65.70%     512.356us      85.393us       0.000us         0.00%      13.343us       2.224us             6  
-                                         aten::_to_copy         2.88%      22.431us        64.98%     506.767us      84.461us       0.000us         0.00%      13.343us       2.224us             6  
-                                            aten::copy_         6.46%      50.411us        58.51%     456.326us      76.054us      11.455us        61.30%      13.343us       2.224us             6  
-                                           aten::conv1d         0.72%       5.580us        14.59%     113.823us      37.941us       0.000us         0.00%       7.231us       2.410us             3  
-                                      aten::convolution         1.19%       9.260us        13.88%     108.243us      36.081us       0.000us         0.00%       7.231us       2.410us             3  
-                                     aten::_convolution         2.87%      22.359us        12.69%      98.983us      32.994us       0.000us         0.00%       7.231us       2.410us             3  
-                                aten::_conv_depthwise2d         2.67%      20.840us         7.84%      61.153us      20.384us       7.231us        38.70%       7.231us       2.410us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.231us        38.70%       7.231us       2.410us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        31.34%       5.856us       1.952us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.599us        29.96%       5.599us       1.866us             3  
-                                Activity Buffer Request        30.21%     235.608us        30.21%     235.608us     235.608us       1.888us        10.10%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.59%      28.010us         3.59%      28.010us       4.668us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.63%     192.088us        24.63%     192.088us      21.343us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.29%      17.871us         2.95%      23.001us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.13%       8.820us         1.13%       8.820us       0.588us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.21%       9.401us         1.21%       9.401us       3.134us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%       9.131us         1.17%       9.131us       3.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       5.851us         0.94%       7.321us       2.440us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.523us      1924.96%     358.523us     358.523us             1  
+                                            torch_eager        17.94%     139.773us        99.33%     774.049us     774.049us       0.000us         0.00%      20.513us      20.513us             1  
+                                               aten::to         0.94%       7.351us        62.88%     489.983us      81.664us       0.000us         0.00%      13.376us       2.229us             6  
+                                         aten::_to_copy         3.20%      24.930us        61.93%     482.632us      80.439us       0.000us         0.00%      13.376us       2.229us             6  
+                                            aten::copy_         6.90%      53.742us        54.52%     424.881us      70.813us      11.488us        61.68%      13.376us       2.229us             6  
+                                           aten::conv1d         0.75%       5.841us        15.01%     116.973us      38.991us       0.000us         0.00%       7.137us       2.379us             3  
+                                      aten::convolution         1.33%      10.360us        14.26%     111.132us      37.044us       0.000us         0.00%       7.137us       2.379us             3  
+                                     aten::_convolution         3.01%      23.430us        12.93%     100.772us      33.591us       0.000us         0.00%       7.137us       2.379us             3  
+                                aten::_conv_depthwise2d         2.81%      21.882us         7.98%      62.192us      20.731us       7.137us        38.32%       7.137us       2.379us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.137us        38.32%       7.137us       2.379us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        31.61%       5.888us       1.963us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.600us        30.07%       5.600us       1.867us             3  
+                                Activity Buffer Request        24.98%     194.695us        24.98%     194.695us     194.695us       1.888us        10.14%       1.888us       1.888us             1  
+                                    aten::empty_strided         4.21%      32.821us         4.21%      32.821us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.28%     197.004us        25.28%     197.004us      21.889us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.16%      16.850us         2.84%      22.160us       2.462us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.13%       8.821us         1.13%       8.821us       0.588us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.22%       9.521us         1.22%       9.521us       3.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.31%      10.229us         1.31%      10.229us       3.410us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.74%       5.740us         0.90%       7.020us       2.340us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 779.877us
-Self CUDA time total: 18.686us
+Self CPU time total: 779.258us
+Self CUDA time total: 18.625us
 
 
 
@@ -4398,29 +4398,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.346us      1628.63%     316.346us     316.346us             1  
-                                            torch_eager        14.51%     117.604us        99.38%     805.188us     805.188us       0.000us         0.00%      21.312us      21.312us             1  
-                                               aten::to         0.69%       5.621us        67.40%     546.068us      91.011us       0.000us         0.00%      13.376us       2.229us             6  
-                                         aten::_to_copy         2.81%      22.789us        66.70%     540.447us      90.075us       0.000us         0.00%      13.376us       2.229us             6  
-                                            aten::copy_         5.89%      47.733us        60.20%     487.757us      81.293us      11.488us        59.14%      13.376us       2.229us             6  
-                                           aten::conv1d         0.69%       5.581us        14.11%     114.294us      38.098us       0.000us         0.00%       7.936us       2.645us             3  
-                                      aten::convolution         1.17%       9.520us        13.42%     108.713us      36.238us       0.000us         0.00%       7.936us       2.645us             3  
-                                     aten::_convolution         2.68%      21.682us        12.24%      99.193us      33.064us       0.000us         0.00%       7.936us       2.645us             3  
-                                aten::_conv_depthwise2d         2.64%      21.391us         7.61%      61.682us      20.561us       7.936us        40.86%       7.936us       2.645us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.86%       7.936us       2.645us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        30.15%       5.856us       1.952us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.00%       5.632us       1.877us             3  
-                                Activity Buffer Request        33.53%     271.649us        33.53%     271.649us     271.649us       1.888us         9.72%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.69%      29.901us         3.69%      29.901us       4.984us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.39%     189.555us        23.39%     189.555us      21.062us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.18%      17.698us         2.81%      22.771us       2.530us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       8.674us         1.07%       8.674us       0.578us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.14%       9.260us         1.14%       9.260us       3.087us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.22%       9.851us         1.22%       9.851us       3.284us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.76%       6.120us         0.93%       7.530us       2.510us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.763us      1698.07%     328.763us     328.763us             1  
+                                            torch_eager        14.65%     115.015us        99.34%     779.670us     779.670us       0.000us         0.00%      21.248us      21.248us             1  
+                                               aten::to         0.80%       6.290us        66.21%     519.631us      86.605us       0.000us         0.00%      13.406us       2.234us             6  
+                                         aten::_to_copy         3.14%      24.649us        65.41%     513.341us      85.557us       0.000us         0.00%      13.406us       2.234us             6  
+                                            aten::copy_         6.80%      53.351us        58.20%     456.761us      76.127us      11.519us        59.50%      13.406us       2.234us             6  
+                                           aten::conv1d         0.75%       5.880us        15.10%     118.484us      39.495us       0.000us         0.00%       7.842us       2.614us             3  
+                                      aten::convolution         1.21%       9.513us        14.35%     112.604us      37.535us       0.000us         0.00%       7.842us       2.614us             3  
+                                     aten::_convolution         2.83%      22.229us        13.14%     103.091us      34.364us       0.000us         0.00%       7.842us       2.614us             3  
+                                aten::_conv_depthwise2d         3.15%      24.720us         8.43%      66.141us      22.047us       7.842us        40.50%       7.842us       2.614us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.842us        40.50%       7.842us       2.614us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.887us        30.41%       5.887us       1.962us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.09%       5.632us       1.877us             3  
+                                Activity Buffer Request        29.55%     231.946us        29.55%     231.946us     231.946us       1.887us         9.75%       1.887us       1.887us             1  
+                                    aten::empty_strided         4.07%      31.931us         4.07%      31.931us       5.322us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.68%     193.684us        24.68%     193.684us      21.520us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      16.541us         2.75%      21.581us       2.398us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       8.568us         1.09%       8.568us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.27%       9.951us         1.27%       9.951us       3.317us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.18%       9.250us         1.18%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.642us         0.89%       6.980us       2.327us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 810.248us
-Self CUDA time total: 19.424us
+Self CPU time total: 784.850us
+Self CUDA time total: 19.361us
 
 
 
@@ -4430,29 +4430,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.590us      1658.13%     322.590us     322.590us             1  
-                                            torch_eager         6.77%     135.447us        99.76%       1.996ms       1.996ms       0.000us         0.00%      21.631us      21.631us             1  
-                                               aten::to         0.29%       5.801us        85.87%       1.718ms     286.282us       0.000us         0.00%      14.400us       2.400us             6  
-                                         aten::_to_copy         1.16%      23.150us        85.58%       1.712ms     285.315us       0.000us         0.00%      14.400us       2.400us             6  
-                                            aten::copy_         2.46%      49.110us        82.93%       1.659ms     276.491us      12.224us        62.83%      14.400us       2.400us             6  
-                                           aten::conv1d         0.28%       5.690us         5.75%     114.953us      38.318us       0.000us         0.00%       7.231us       2.410us             3  
-                                      aten::convolution         0.48%       9.520us         5.46%     109.263us      36.421us       0.000us         0.00%       7.231us       2.410us             3  
-                                     aten::_convolution         1.10%      21.931us         4.99%      99.743us      33.248us       0.000us         0.00%       7.231us       2.410us             3  
-                                aten::_conv_depthwise2d         1.06%      21.231us         3.12%      62.372us      20.791us       7.231us        37.17%       7.231us       2.410us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.231us        37.17%       7.231us       2.410us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.40%       6.304us       2.101us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        30.43%       5.920us       1.973us             3  
-                                Activity Buffer Request        71.98%       1.440ms        71.98%       1.440ms       1.440ms       2.176us        11.18%       2.176us       2.176us             1  
-                                    aten::empty_strided         1.49%      29.791us         1.49%      29.791us       4.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.66%     193.277us         9.66%     193.277us      21.475us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.86%      17.278us         1.13%      22.539us       2.504us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.001us         0.45%       9.001us       0.600us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.46%       9.281us         0.46%       9.281us       3.094us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.43%       8.570us         0.43%       8.570us       2.857us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.760us         0.36%       7.200us       2.400us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.454us      1698.73%     330.454us     330.454us             1  
+                                            torch_eager        14.50%     115.185us        99.38%     789.290us     789.290us       0.000us         0.00%      21.628us      21.628us             1  
+                                               aten::to         0.75%       5.979us        66.62%     529.132us      88.189us       0.000us         0.00%      14.332us       2.389us             6  
+                                         aten::_to_copy         3.11%      24.732us        65.87%     523.153us      87.192us       0.000us         0.00%      14.332us       2.389us             6  
+                                            aten::copy_         6.75%      53.590us        58.69%     466.101us      77.684us      12.157us        62.49%      14.332us       2.389us             6  
+                                           aten::conv1d         0.72%       5.740us        14.75%     117.122us      39.041us       0.000us         0.00%       7.296us       2.432us             3  
+                                      aten::convolution         1.18%       9.359us        14.02%     111.382us      37.127us       0.000us         0.00%       7.296us       2.432us             3  
+                                     aten::_convolution         2.82%      22.362us        12.85%     102.023us      34.008us       0.000us         0.00%       7.296us       2.432us             3  
+                                aten::_conv_depthwise2d         2.86%      22.741us         8.10%      64.351us      21.450us       7.296us        37.51%       7.296us       2.432us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.296us        37.51%       7.296us       2.432us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.238us        32.07%       6.238us       2.079us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.43%       5.919us       1.973us             3  
+                                Activity Buffer Request        30.19%     239.746us        30.19%     239.746us     239.746us       2.175us        11.18%       2.175us       2.175us             1  
+                                    aten::empty_strided         4.07%      32.320us         4.07%      32.320us       5.387us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.58%     195.235us        24.58%     195.235us      21.693us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.10%      16.713us         2.76%      21.891us       2.432us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.12%       8.919us         1.12%       8.919us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.709us         0.89%       7.030us       2.343us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.000ms
-Self CUDA time total: 19.455us
+Self CPU time total: 794.200us
+Self CUDA time total: 19.453us
 
 
 
@@ -4462,29 +4462,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.589us      1627.82%     326.589us     326.589us             1  
-                                            torch_eager         7.03%     140.275us        99.72%       1.991ms       1.991ms       0.000us         0.00%      22.207us      22.207us             1  
-                                               aten::to         0.30%       6.010us        85.45%       1.706ms     284.341us       0.000us         0.00%      14.304us       2.384us             6  
-                                         aten::_to_copy         1.18%      23.623us        85.15%       1.700ms     283.340us       0.000us         0.00%      14.304us       2.384us             6  
-                                            aten::copy_         2.42%      48.261us        82.53%       1.648ms     274.613us      12.160us        60.61%      14.304us       2.384us             6  
-                                           aten::conv1d         0.34%       6.690us         5.89%     117.664us      39.221us       0.000us         0.00%       7.903us       2.634us             3  
-                                      aten::convolution         0.46%       9.260us         5.56%     110.974us      36.991us       0.000us         0.00%       7.903us       2.634us             3  
-                                     aten::_convolution         1.15%      23.009us         5.09%     101.714us      33.905us       0.000us         0.00%       7.903us       2.634us             3  
-                                aten::_conv_depthwise2d         1.10%      21.970us         3.15%      62.812us      20.937us       7.903us        39.39%       7.903us       2.634us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us        39.39%       7.903us       2.634us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.10%       6.240us       2.080us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.51%       5.920us       1.973us             3  
-                                Activity Buffer Request        71.49%       1.427ms        71.49%       1.427ms       1.427ms       2.144us        10.69%       2.144us       2.144us             1  
-                                    aten::empty_strided         1.44%      28.740us         1.44%      28.740us       4.790us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.68%     193.308us         9.68%     193.308us      21.479us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.85%      16.982us         1.11%      22.224us       2.469us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.892us         0.45%       8.892us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.420us         0.47%       9.420us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%      10.100us         0.51%      10.100us       3.367us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       6.130us         0.38%       7.650us       2.550us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.021us      1622.51%     325.021us     325.021us             1  
+                                            torch_eager        14.95%     114.725us        99.33%     762.279us     762.279us       0.000us         0.00%      22.176us      22.176us             1  
+                                               aten::to         0.78%       5.949us        65.87%     505.530us      84.255us       0.000us         0.00%      14.272us       2.379us             6  
+                                         aten::_to_copy         3.19%      24.509us        65.10%     499.581us      83.264us       0.000us         0.00%      14.272us       2.379us             6  
+                                            aten::copy_         6.59%      50.599us        57.97%     444.890us      74.148us      12.128us        60.54%      14.272us       2.379us             6  
+                                           aten::conv1d         0.79%       6.100us        15.11%     115.973us      38.658us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         1.34%      10.290us        14.32%     109.873us      36.624us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         2.97%      22.812us        12.98%      99.583us      33.194us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         2.93%      22.501us         8.10%      62.182us      20.727us       7.904us        39.46%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        39.46%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        30.99%       6.208us       2.069us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.55%       5.920us       1.973us             3  
+                                Activity Buffer Request        28.71%     220.306us        28.71%     220.306us     220.306us       2.144us        10.70%       2.144us       2.144us             1  
+                                    aten::empty_strided         3.93%      30.182us         3.93%      30.182us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.32%     194.286us        25.32%     194.286us      21.587us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      16.159us         2.76%      21.209us       2.357us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       8.360us         1.09%       8.360us       0.557us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%       9.450us         1.23%       9.450us       3.150us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.29%       9.930us         1.29%       9.930us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.470us         0.87%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.997ms
-Self CUDA time total: 20.063us
+Self CPU time total: 767.429us
+Self CUDA time total: 20.032us
 
 
 
@@ -4494,29 +4494,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.164us       887.36%     319.164us     319.164us             1  
-                                            torch_eager        15.26%     115.785us        99.38%     754.046us     754.046us       0.000us         0.00%      38.560us      38.560us             1  
-                                           aten::conv1d         0.72%       5.471us        14.90%     113.045us      37.682us       0.000us         0.00%      20.097us       6.699us             3  
-                                      aten::convolution         1.25%       9.510us        14.18%     107.574us      35.858us       0.000us         0.00%      20.097us       6.699us             3  
-                                     aten::_convolution         2.85%      21.590us        12.92%      98.064us      32.688us       0.000us         0.00%      20.097us       6.699us             3  
-                                aten::_conv_depthwise2d         2.82%      21.412us         8.06%      61.133us      20.378us      20.097us        55.87%      20.097us       6.699us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.097us        55.87%      20.097us       6.699us             3  
-                                               aten::to         0.74%       5.628us        65.55%     497.346us      82.891us       0.000us         0.00%      18.463us       3.077us             6  
-                                         aten::_to_copy         3.02%      22.942us        64.80%     491.718us      81.953us       0.000us         0.00%      18.463us       3.077us             6  
-                                            aten::copy_         6.50%      49.290us        57.91%     439.376us      73.229us      15.871us        44.13%      18.463us       3.077us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.447us        23.48%       8.447us       2.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        20.64%       7.424us       2.475us             3  
-                                Activity Buffer Request        28.99%     219.958us        28.99%     219.958us     219.958us       2.592us         7.21%       2.592us       2.592us             1  
-                                    aten::empty_strided         3.87%      29.400us         3.87%      29.400us       4.900us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.31%     192.058us        25.31%     192.058us      21.340us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.43%      18.410us         3.09%      23.410us       2.601us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.12%       8.490us         1.12%       8.490us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.20%       9.081us         1.20%       9.081us       3.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       8.710us         1.15%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.77%       5.871us         0.96%       7.301us       2.434us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     356.764us       983.15%     356.764us     356.764us             1  
+                                            torch_eager        15.53%     123.844us        99.36%     792.350us     792.350us       0.000us         0.00%      38.944us      38.944us             1  
+                                           aten::conv1d         0.79%       6.320us        15.33%     122.233us      40.744us       0.000us         0.00%      20.320us       6.773us             3  
+                                      aten::convolution         1.24%       9.851us        14.54%     115.913us      38.638us       0.000us         0.00%      20.320us       6.773us             3  
+                                     aten::_convolution         2.89%      23.052us        13.30%     106.062us      35.354us       0.000us         0.00%      20.320us       6.773us             3  
+                                aten::_conv_depthwise2d         2.97%      23.692us         8.39%      66.891us      22.297us      20.320us        56.00%      20.320us       6.773us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.320us        56.00%      20.320us       6.773us             3  
+                                               aten::to         0.80%       6.349us        64.76%     516.391us      86.065us       0.000us         0.00%      18.624us       3.104us             6  
+                                         aten::_to_copy         3.21%      25.572us        63.96%     510.042us      85.007us       0.000us         0.00%      18.624us       3.104us             6  
+                                            aten::copy_         6.54%      52.120us        56.52%     450.739us      75.123us      15.968us        44.00%      18.624us       3.104us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.607us        23.72%       8.607us       2.869us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        20.28%       7.361us       2.454us             3  
+                                Activity Buffer Request        27.46%     218.966us        27.46%     218.966us     218.966us       2.656us         7.32%       2.656us       2.656us             1  
+                                    aten::empty_strided         4.23%      33.731us         4.23%      33.731us       5.622us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.38%     202.413us        25.38%     202.413us      22.490us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.20%      17.520us         2.88%      22.939us       2.549us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.21%       9.679us         1.21%       9.679us       0.645us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.40%      11.140us         1.40%      11.140us       3.713us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.17%       9.299us         1.17%       9.299us       3.100us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       6.010us         0.93%       7.450us       2.483us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 758.766us
-Self CUDA time total: 35.968us
+Self CPU time total: 797.430us
+Self CUDA time total: 36.288us
 
 
 
@@ -4526,29 +4526,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     318.166us       839.07%     318.166us     318.166us             1  
-                                            torch_eager        15.61%     115.614us        99.23%     735.056us     735.056us       0.000us         0.00%      40.512us      40.512us             1  
-                                           aten::conv1d         0.77%       5.689us        15.23%     112.833us      37.611us       0.000us         0.00%      22.206us       7.402us             3  
-                                      aten::convolution         1.28%       9.450us        14.46%     107.144us      35.715us       0.000us         0.00%      22.206us       7.402us             3  
-                                     aten::_convolution         2.90%      21.450us        13.19%      97.694us      32.565us       0.000us         0.00%      22.206us       7.402us             3  
-                                aten::_conv_depthwise2d         2.86%      21.190us         8.15%      60.352us      20.117us      22.206us        58.56%      22.206us       7.402us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.206us        58.56%      22.206us       7.402us             3  
-                                               aten::to         0.76%       5.621us        64.62%     478.657us      79.776us       0.000us         0.00%      18.306us       3.051us             6  
-                                         aten::_to_copy         3.14%      23.241us        63.86%     473.036us      78.839us       0.000us         0.00%      18.306us       3.051us             6  
-                                            aten::copy_         6.66%      49.364us        56.82%     420.865us      70.144us      15.713us        41.44%      18.306us       3.051us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.385us        22.11%       8.385us       2.795us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        19.33%       7.328us       2.443us             3  
-                                Activity Buffer Request        27.11%     200.816us        27.11%     200.816us     200.816us       2.593us         6.84%       2.593us       2.593us             1  
-                                    aten::empty_strided         3.91%      28.930us         3.91%      28.930us       4.822us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.94%     192.117us        25.94%     192.117us      21.346us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.42%      17.932us         3.14%      23.222us       2.580us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.19%       8.781us         1.19%       8.781us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.25%       9.270us         1.25%       9.270us       3.090us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.14%       8.460us         1.14%       8.460us       2.820us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.85%       6.280us         1.02%       7.591us       2.530us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.353us       866.25%     332.353us     332.353us             1  
+                                            torch_eager         6.20%     124.083us        99.73%       1.997ms       1.997ms       0.000us         0.00%      40.959us      40.959us             1  
+                                           aten::conv1d         0.30%       6.071us         5.74%     115.013us      38.338us       0.000us         0.00%      22.592us       7.531us             3  
+                                      aten::convolution         0.48%       9.660us         5.44%     108.942us      36.314us       0.000us         0.00%      22.592us       7.531us             3  
+                                     aten::_convolution         1.09%      21.840us         4.96%      99.282us      33.094us       0.000us         0.00%      22.592us       7.531us             3  
+                                aten::_conv_depthwise2d         1.15%      22.991us         3.11%      62.342us      20.781us      22.592us        58.88%      22.592us       7.531us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        58.88%      22.592us       7.531us             3  
+                                               aten::to         0.32%       6.339us        86.44%       1.731ms     288.505us       0.000us         0.00%      18.367us       3.061us             6  
+                                         aten::_to_copy         1.25%      24.980us        86.12%       1.725ms     287.449us       0.000us         0.00%      18.367us       3.061us             6  
+                                            aten::copy_         2.51%      50.252us        83.36%       1.669ms     278.222us      15.775us        41.12%      18.367us       3.061us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.94%       8.416us       2.805us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.18%       7.359us       2.453us             3  
+                                Activity Buffer Request        72.13%       1.445ms        72.13%       1.445ms       1.445ms       2.592us         6.76%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.52%      30.382us         1.52%      30.382us       5.064us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.74%     194.985us         9.74%     194.985us      21.665us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      17.330us         1.13%      22.630us       2.514us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       8.941us         0.45%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.610us         0.48%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.46%       9.250us         0.46%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       5.490us         0.34%       6.780us       2.260us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 740.726us
-Self CUDA time total: 37.919us
+Self CPU time total: 2.003ms
+Self CUDA time total: 38.367us
 
 
 
@@ -4558,29 +4558,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     321.366us       502.64%     321.366us     321.366us             1  
-                                            torch_eager        15.27%     113.396us        99.28%     737.126us     737.126us       0.000us         0.00%      68.031us      68.031us             1  
-                                           aten::conv1d         0.76%       5.670us        15.56%     115.503us      38.501us       0.000us         0.00%      41.567us      13.856us             3  
-                                      aten::convolution         1.28%       9.489us        14.79%     109.833us      36.611us       0.000us         0.00%      41.567us      13.856us             3  
-                                     aten::_convolution         3.08%      22.850us        13.52%     100.344us      33.448us       0.000us         0.00%      41.567us      13.856us             3  
-                                aten::_conv_depthwise2d         2.89%      21.483us         8.27%      61.383us      20.461us      41.567us        65.01%      41.567us      13.856us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.567us        65.01%      41.567us      13.856us             3  
-                                               aten::to         0.76%       5.660us        64.85%     481.506us      80.251us       0.000us         0.00%      26.464us       4.411us             6  
-                                         aten::_to_copy         3.08%      22.842us        64.09%     475.846us      79.308us       0.000us         0.00%      26.464us       4.411us             6  
-                                            aten::copy_         6.57%      48.752us        57.01%     423.304us      70.551us      22.368us        34.99%      26.464us       4.411us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.968us        18.72%      11.968us       3.989us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        16.27%      10.400us       3.467us             3  
-                                Activity Buffer Request        27.27%     202.487us        27.27%     202.487us     202.487us       4.096us         6.41%       4.096us       4.096us             1  
-                                    aten::empty_strided         4.00%      29.700us         4.00%      29.700us       4.950us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.15%     194.125us        26.15%     194.125us      21.569us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.30%      17.061us         2.99%      22.191us       2.466us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.19%       8.800us         1.19%       8.800us       0.587us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.25%       9.280us         1.25%       9.280us       3.093us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       8.560us         1.15%       8.560us       2.853us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.77%       5.741us         0.96%       7.151us       2.384us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.952us       509.17%     328.952us     328.952us             1  
+                                            torch_eager        15.31%     114.903us        99.32%     745.599us     745.599us       0.000us         0.00%      68.701us      68.701us             1  
+                                           aten::conv1d         0.89%       6.660us        15.50%     116.373us      38.791us       0.000us         0.00%      42.238us      14.079us             3  
+                                      aten::convolution         1.33%       9.952us        14.61%     109.713us      36.571us       0.000us         0.00%      42.238us      14.079us             3  
+                                     aten::_convolution         2.95%      22.149us        13.29%      99.761us      33.254us       0.000us         0.00%      42.238us      14.079us             3  
+                                aten::_conv_depthwise2d         2.94%      22.090us         8.38%      62.891us      20.964us      42.238us        65.38%      42.238us      14.079us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      42.238us        65.38%      42.238us      14.079us             3  
+                                               aten::to         0.80%       6.039us        65.05%     488.341us      81.390us       0.000us         0.00%      26.463us       4.410us             6  
+                                         aten::_to_copy         3.23%      24.281us        64.25%     482.302us      80.384us       0.000us         0.00%      26.463us       4.410us             6  
+                                            aten::copy_         6.57%      49.302us        56.69%     425.561us      70.927us      22.367us        34.62%      26.463us       4.410us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.936us        18.48%      11.936us       3.979us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        16.15%      10.431us       3.477us             3  
+                                Activity Buffer Request        26.58%     199.565us        26.58%     199.565us     199.565us       4.096us         6.34%       4.096us       4.096us             1  
+                                    aten::empty_strided         4.32%      32.460us         4.32%      32.460us       5.410us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.45%     198.565us        26.45%     198.565us      22.063us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      16.001us         2.81%      21.091us       2.343us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.16%       8.690us         1.16%       8.690us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.26%       9.490us         1.26%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.26%       9.440us         1.26%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       5.611us         0.93%       6.981us       2.327us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 742.446us
-Self CUDA time total: 63.935us
+Self CPU time total: 750.709us
+Self CUDA time total: 64.605us
 
 
 
@@ -4590,29 +4590,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.264us       468.36%     326.264us     326.264us             1  
-                                            torch_eager        14.61%     117.663us        99.38%     800.347us     800.347us       0.000us         0.00%      73.789us      73.789us             1  
-                                           aten::conv1d         0.75%       6.020us        14.38%     115.844us      38.615us       0.000us         0.00%      47.230us      15.743us             3  
-                                      aten::convolution         1.16%       9.351us        13.64%     109.824us      36.608us       0.000us         0.00%      47.230us      15.743us             3  
-                                     aten::_convolution         2.76%      22.250us        12.48%     100.473us      33.491us       0.000us         0.00%      47.230us      15.743us             3  
-                                aten::_conv_depthwise2d         2.71%      21.790us         7.76%      62.461us      20.820us      47.230us        67.80%      47.230us      15.743us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.230us        67.80%      47.230us      15.743us             3  
-                                               aten::to         0.71%       5.690us        66.94%     539.059us      89.843us       0.000us         0.00%      26.559us       4.426us             6  
-                                         aten::_to_copy         2.87%      23.082us        66.23%     533.369us      88.895us       0.000us         0.00%      26.559us       4.426us             6  
-                                            aten::copy_         6.12%      49.260us        59.73%     480.976us      80.163us      22.431us        32.20%      26.559us       4.426us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        17.23%      12.000us       4.000us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        14.97%      10.431us       3.477us             3  
-                                Activity Buffer Request        29.99%     241.509us        29.99%     241.509us     241.509us       4.128us         5.93%       4.128us       4.128us             1  
-                                    aten::empty_strided         3.64%      29.311us         3.64%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.37%     212.348us        26.37%     212.348us      23.594us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.22%      17.841us         2.86%      23.041us       2.560us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       8.761us         1.09%       8.761us       0.584us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.16%       9.320us         1.16%       9.320us       3.107us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.14%       9.210us         1.14%       9.210us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.77%       6.201us         0.95%       7.621us       2.540us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.798us       467.68%     328.798us     328.798us             1  
+                                            torch_eager        14.69%     115.264us        99.37%     779.669us     779.669us       0.000us         0.00%      74.432us      74.432us             1  
+                                           aten::conv1d         0.75%       5.869us        14.89%     116.853us      38.951us       0.000us         0.00%      47.840us      15.947us             3  
+                                      aten::convolution         1.20%       9.412us        14.15%     110.984us      36.995us       0.000us         0.00%      47.840us      15.947us             3  
+                                     aten::_convolution         2.99%      23.451us        12.95%     101.572us      33.857us       0.000us         0.00%      47.840us      15.947us             3  
+                                aten::_conv_depthwise2d         2.71%      21.281us         8.10%      63.532us      21.177us      47.840us        68.05%      47.840us      15.947us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.840us        68.05%      47.840us      15.947us             3  
+                                               aten::to         0.74%       5.828us        66.46%     521.411us      86.902us       0.000us         0.00%      26.592us       4.432us             6  
+                                         aten::_to_copy         3.27%      25.622us        65.71%     515.583us      85.931us       0.000us         0.00%      26.592us       4.432us             6  
+                                            aten::copy_         6.42%      50.382us        58.46%     458.651us      76.442us      22.464us        31.95%      26.592us       4.432us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.032us        17.11%      12.032us       4.011us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        14.84%      10.432us       3.477us             3  
+                                Activity Buffer Request        29.93%     234.846us        29.93%     234.846us     234.846us       4.128us         5.87%       4.128us       4.128us             1  
+                                    aten::empty_strided         3.99%      31.310us         3.99%      31.310us       5.218us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.83%     194.803us        24.83%     194.803us      21.645us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.07%      16.243us         2.72%      21.332us       2.370us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.401us         1.07%       8.401us       0.560us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.35%      10.581us         1.35%      10.581us       3.527us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.31%      10.290us         1.31%      10.290us       3.430us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.69%       5.406us         0.84%       6.568us       2.189us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 805.317us
-Self CUDA time total: 69.661us
+Self CPU time total: 784.589us
+Self CUDA time total: 70.304us
 
 
 
@@ -4622,29 +4622,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     372.509us       200.60%     372.509us     372.509us             1  
-                                            torch_eager        16.32%     136.903us        99.36%     833.418us     833.418us       0.000us         0.00%     195.711us     195.711us             1  
-                                           aten::conv1d         0.67%       5.580us        15.45%     129.615us      43.205us       0.000us         0.00%     133.247us      44.416us             3  
-                                      aten::convolution         1.13%       9.510us        14.79%     124.035us      41.345us       0.000us         0.00%     133.247us      44.416us             3  
-                                     aten::_convolution         3.89%      32.633us        13.65%     114.525us      38.175us       0.000us         0.00%     133.247us      44.416us             3  
-                                aten::_conv_depthwise2d         2.50%      20.960us         7.87%      66.022us      22.007us     133.247us        71.76%     133.247us      44.416us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.247us        71.76%     133.247us      44.416us             3  
-                                               aten::to         0.72%       6.039us        64.27%     539.099us      89.850us       0.000us         0.00%      62.464us      10.411us             6  
-                                         aten::_to_copy         2.75%      23.094us        63.55%     533.060us      88.843us       0.000us         0.00%      62.464us      10.411us             6  
-                                            aten::copy_         5.97%      50.071us        57.15%     479.385us      79.897us      52.448us        28.24%      62.464us      10.411us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.504us        15.89%      29.504us       9.835us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.944us        12.36%      22.944us       7.648us             3  
-                                Activity Buffer Request        30.64%     256.969us        30.64%     256.969us     256.969us      10.016us         5.39%      10.016us      10.016us             1  
-                                    aten::empty_strided         3.65%      30.581us         3.65%      30.581us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.59%     197.827us        23.59%     197.827us      21.981us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.16%      18.130us         2.81%      23.610us       2.623us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       9.169us         1.09%       9.169us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.19%       9.940us         1.19%       9.940us       3.313us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       9.640us         1.15%       9.640us       3.213us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       6.001us         0.89%       7.490us       2.497us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.882us       182.91%     341.882us     341.882us             1  
+                                            torch_eager        15.14%     117.185us        99.33%     768.879us     768.879us       0.000us         0.00%     197.117us     197.117us             1  
+                                           aten::conv1d         0.79%       6.110us        14.86%     114.993us      38.331us       0.000us         0.00%     134.270us      44.757us             3  
+                                      aten::convolution         1.22%       9.451us        14.07%     108.883us      36.294us       0.000us         0.00%     134.270us      44.757us             3  
+                                     aten::_convolution         2.87%      22.240us        12.85%      99.432us      33.144us       0.000us         0.00%     134.270us      44.757us             3  
+                                aten::_conv_depthwise2d         2.84%      21.991us         8.04%      62.222us      20.741us     134.270us        71.84%     134.270us      44.757us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     134.270us        71.84%     134.270us      44.757us             3  
+                                               aten::to         0.77%       5.950us        65.77%     509.102us      84.850us       0.000us         0.00%      62.847us      10.474us             6  
+                                         aten::_to_copy         3.29%      25.489us        65.00%     503.152us      83.859us       0.000us         0.00%      62.847us      10.474us             6  
+                                            aten::copy_         6.45%      49.889us        57.58%     445.721us      74.287us      52.639us        28.16%      62.847us      10.474us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.728us        15.91%      29.728us       9.909us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.911us        12.26%      22.911us       7.637us             3  
+                                Activity Buffer Request        28.61%     221.416us        28.61%     221.416us     221.416us      10.208us         5.46%      10.208us      10.208us             1  
+                                    aten::empty_strided         4.13%      31.942us         4.13%      31.942us       5.324us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.24%     195.386us        25.24%     195.386us      21.710us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.14%      16.602us         2.90%      22.460us       2.496us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.19%       9.247us         1.19%       9.247us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%       9.500us         1.23%       9.500us       3.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.26%       9.761us         1.26%       9.761us       3.254us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.470us         0.87%       6.700us       2.233us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 838.778us
-Self CUDA time total: 185.695us
+Self CPU time total: 774.039us
+Self CUDA time total: 186.909us
 
 
 
@@ -4654,29 +4654,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     368.701us       175.32%     368.701us     368.701us             1  
-                                            torch_eager        16.38%     138.724us        99.39%     841.559us     841.559us       0.000us         0.00%     224.383us     224.383us             1  
-                                           aten::conv1d         0.69%       5.870us        14.05%     118.945us      39.648us       0.000us         0.00%     154.015us      51.338us             3  
-                                      aten::convolution         1.19%      10.050us        13.35%     113.075us      37.692us       0.000us         0.00%     154.015us      51.338us             3  
-                                     aten::_convolution         2.68%      22.669us        12.17%     103.025us      34.342us       0.000us         0.00%     154.015us      51.338us             3  
-                                aten::_conv_depthwise2d         2.54%      21.472us         7.66%      64.883us      21.628us     154.015us        73.23%     154.015us      51.338us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.015us        73.23%     154.015us      51.338us             3  
-                                               aten::to         0.70%       5.911us        65.49%     554.540us      92.423us       0.000us         0.00%      70.368us      11.728us             6  
-                                         aten::_to_copy         2.70%      22.862us        64.79%     548.629us      91.438us       0.000us         0.00%      70.368us      11.728us             6  
-                                            aten::copy_         5.97%      50.511us        58.49%     495.276us      82.546us      56.288us        26.77%      70.368us      11.728us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      33.248us        15.81%      33.248us      11.083us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        10.96%      23.040us       7.680us             3  
-                                Activity Buffer Request        32.21%     272.739us        32.21%     272.739us     272.739us      14.080us         6.70%      14.080us      14.080us             1  
-                                    aten::empty_strided         3.60%      30.491us         3.60%      30.491us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.06%     195.277us        23.06%     195.277us      21.697us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.26%      19.134us         2.91%      24.623us       2.736us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       9.019us         1.07%       9.019us       0.601us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.15%       9.700us         1.15%       9.700us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.24%      10.460us         1.24%      10.460us       3.487us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.68%       5.760us         0.85%       7.180us       2.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     349.277us       165.88%     349.277us     349.277us             1  
+                                            torch_eager        15.39%     117.165us        99.36%     756.609us     756.609us       0.000us         0.00%     224.029us     224.029us             1  
+                                           aten::conv1d         0.74%       5.661us        15.33%     116.734us      38.911us       0.000us         0.00%     154.686us      51.562us             3  
+                                      aten::convolution         1.20%       9.150us        14.59%     111.073us      37.024us       0.000us         0.00%     154.686us      51.562us             3  
+                                     aten::_convolution         2.96%      22.532us        13.38%     101.923us      33.974us       0.000us         0.00%     154.686us      51.562us             3  
+                                aten::_conv_depthwise2d         2.86%      21.751us         8.47%      64.492us      21.497us     154.686us        73.47%     154.686us      51.562us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.686us        73.47%     154.686us      51.562us             3  
+                                               aten::to         0.84%       6.379us        65.15%     496.150us      82.692us       0.000us         0.00%      69.343us      11.557us             6  
+                                         aten::_to_copy         3.33%      25.371us        64.32%     489.771us      81.628us       0.000us         0.00%      69.343us      11.557us             6  
+                                            aten::copy_         6.44%      49.031us        56.76%     432.240us      72.040us      55.871us        26.53%      69.343us      11.557us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.831us        15.59%      32.831us      10.944us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        10.94%      23.040us       7.680us             3  
+                                Activity Buffer Request        27.33%     208.145us        27.33%     208.145us     208.145us      13.472us         6.40%      13.472us      13.472us             1  
+                                    aten::empty_strided         4.22%      32.160us         4.22%      32.160us       5.360us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.87%     197.025us        25.87%     197.025us      21.892us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.14%      16.329us         2.83%      21.520us       2.391us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.17%       8.932us         1.17%       8.932us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.38%      10.500us         1.38%      10.500us       3.500us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.35%      10.280us         1.35%      10.280us       3.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.468us         0.90%       6.839us       2.280us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 846.749us
-Self CUDA time total: 210.303us
+Self CPU time total: 761.499us
+Self CUDA time total: 210.557us
 
 
 
@@ -4686,29 +4686,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.86%     124.525us        53.03%     963.064us     963.064us       0.000us         0.00%       1.524ms       1.524ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.423ms       100.42%       1.423ms       1.423ms             1  
-                                               aten::to         0.37%       6.781us        38.11%     692.105us     115.351us       0.000us         0.00%     827.798us     137.966us             6  
-                                         aten::_to_copy         1.62%      29.329us        37.74%     685.324us     114.221us       0.000us         0.00%     827.798us     137.966us             6  
-                                            aten::copy_         2.86%      52.014us        24.74%     449.228us      74.871us     721.111us        50.87%     827.798us     137.966us             6  
-                                           aten::conv1d         0.32%       5.800us         6.51%     118.154us      39.385us       0.000us         0.00%     696.313us     232.104us             3  
-                                      aten::convolution         0.55%       9.981us         6.19%     112.354us      37.451us       0.000us         0.00%     696.313us     232.104us             3  
-                                     aten::_convolution         1.25%      22.722us         5.64%     102.373us      34.124us       0.000us         0.00%     696.313us     232.104us             3  
-                                aten::_conv_depthwise2d         1.22%      22.241us         3.54%      64.332us      21.444us     696.313us        49.13%     696.313us     232.104us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.313us        49.13%     696.313us     232.104us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     411.194us        29.01%     411.194us     137.065us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     309.917us        21.86%     309.917us     103.306us             3  
-                                Activity Buffer Request        12.02%     218.207us        12.02%     218.207us     218.207us     106.687us         7.53%     106.687us     106.687us             1  
-                                    aten::empty_strided         1.97%      35.692us        11.39%     206.767us      34.461us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.11%     201.717us        11.11%     201.717us      22.413us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.96%      17.369us         1.26%      22.889us       2.543us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.51%       9.249us         0.51%       9.249us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%       9.061us         0.50%       9.061us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.57%      10.320us         0.57%      10.320us       3.440us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.33%       5.990us         0.41%       7.360us       2.453us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.72%     121.944us        52.58%     953.714us     953.714us       0.000us         0.00%       1.521ms       1.521ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.421ms       100.41%       1.421ms       1.421ms             1  
+                                               aten::to         0.35%       6.300us        37.63%     682.555us     113.759us       0.000us         0.00%     824.097us     137.350us             6  
+                                         aten::_to_copy         1.68%      30.549us        37.28%     676.255us     112.709us       0.000us         0.00%     824.097us     137.350us             6  
+                                            aten::copy_         2.98%      53.981us        24.83%     450.422us      75.070us     718.817us        50.79%     824.097us     137.350us             6  
+                                           aten::conv1d         0.35%       6.281us         6.65%     120.554us      40.185us       0.000us         0.00%     696.543us     232.181us             3  
+                                      aten::convolution         0.57%      10.251us         6.30%     114.273us      38.091us       0.000us         0.00%     696.543us     232.181us             3  
+                                     aten::_convolution         1.27%      23.111us         5.73%     104.022us      34.674us       0.000us         0.00%     696.543us     232.181us             3  
+                                aten::_conv_depthwise2d         1.23%      22.359us         3.60%      65.321us      21.774us     696.543us        49.21%     696.543us     232.181us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.543us        49.21%     696.543us     232.181us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     409.920us        28.96%     409.920us     136.640us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     308.897us        21.82%     308.897us     102.966us             3  
+                                Activity Buffer Request        11.98%     217.246us        11.98%     217.246us     217.246us     105.280us         7.44%     105.280us     105.280us             1  
+                                    aten::empty_strided         2.17%      39.370us        10.77%     195.284us      32.547us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.13%     201.976us        11.13%     201.976us      22.442us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.99%      18.030us         1.31%      23.761us       2.640us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.53%       9.620us         0.53%       9.620us       0.641us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.59%      10.751us         0.59%      10.751us       3.584us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.52%       9.430us         0.52%       9.430us       3.143us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.670us         0.39%       7.030us       2.343us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.816ms
-Self CUDA time total: 1.417ms
+Self CPU time total: 1.814ms
+Self CUDA time total: 1.415ms
 
 
 
@@ -4718,33 +4718,33 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.33%     114.706us        41.01%     743.286us     743.286us       0.000us         0.00%       1.500ms       1.500ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.431ms       100.39%       1.431ms       1.431ms             1  
-                                               aten::to         0.32%       5.881us        26.81%     485.936us      80.989us       0.000us         0.00%     762.577us     127.096us             6  
-                                         aten::_to_copy         1.28%      23.109us        26.49%     480.055us      80.009us       0.000us         0.00%     762.577us     127.096us             6  
-                                            aten::copy_         2.74%      49.733us        23.67%     429.056us      71.509us     687.698us        48.25%     762.577us     127.096us             6  
-                                           aten::conv1d         0.31%       5.590us         6.38%     115.623us      38.541us       0.000us         0.00%     737.523us     245.841us             3  
-                                      aten::convolution         0.55%       9.990us         6.07%     110.033us      36.678us       0.000us         0.00%     737.523us     245.841us             3  
-                                     aten::_convolution         1.21%      21.900us         5.52%     100.043us      33.348us       0.000us         0.00%     737.523us     245.841us             3  
-                                aten::_conv_depthwise2d         1.16%      21.072us         3.45%      62.453us      20.818us     737.523us        51.75%     737.523us     245.841us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     737.523us        51.75%     737.523us     245.841us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     400.247us        28.08%     400.247us     133.416us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     287.451us        20.17%     287.451us      95.817us             3  
-                                Activity Buffer Request        11.32%     205.227us        11.32%     205.227us     205.227us      74.879us         5.25%      74.879us      74.879us             1  
-                                    aten::empty_strided         1.54%      27.890us         1.54%      27.890us       4.648us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.89%     197.296us        10.89%     197.296us      21.922us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.95%      17.181us         1.23%      22.321us       2.480us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.49%       8.961us         0.49%       8.961us       0.597us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%       9.050us         0.50%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%       9.131us         0.50%       9.131us       3.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       5.870us         0.41%       7.390us       2.463us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         4.05%     123.714us        65.96%       2.016ms       2.016ms       0.000us         0.00%       1.502ms       1.502ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.433ms       100.43%       1.433ms       1.433ms             1  
+                                               aten::to         0.21%       6.507us        56.82%       1.737ms     289.475us       0.000us         0.00%     764.927us     127.488us             6  
+                                         aten::_to_copy         0.85%      25.961us        56.61%       1.730ms     288.391us       0.000us         0.00%     764.927us     127.488us             6  
+                                            aten::copy_         1.76%      53.800us        54.73%       1.673ms     278.832us     689.887us        48.36%     764.927us     127.488us             6  
+                                           aten::conv1d         0.20%       6.220us         4.18%     127.663us      42.554us       0.000us         0.00%     736.735us     245.578us             3  
+                                      aten::convolution         0.34%      10.420us         3.97%     121.443us      40.481us       0.000us         0.00%     736.735us     245.578us             3  
+                                     aten::_convolution         0.75%      22.860us         3.63%     111.023us      37.008us       0.000us         0.00%     736.735us     245.578us             3  
+                                aten::_conv_depthwise2d         0.96%      29.441us         2.37%      72.583us      24.194us     736.735us        51.64%     736.735us     245.578us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     736.735us        51.64%     736.735us     245.578us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.471us        27.86%     397.471us     132.490us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     292.416us        20.50%     292.416us      97.472us             3  
+                                Activity Buffer Request        47.26%       1.445ms        47.26%       1.445ms       1.445ms      75.040us         5.26%      75.040us      75.040us             1  
+                                    aten::empty_strided         1.03%      31.391us         1.03%      31.391us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         6.45%     197.169us         6.45%     197.169us      21.908us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.57%      17.300us         0.75%      22.850us       2.539us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.30%       9.200us         0.30%       9.200us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.32%       9.780us         0.32%       9.780us       3.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.36%      10.870us         0.36%      10.870us       3.623us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.19%       5.770us         0.23%       7.180us       2.393us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.812ms
-Self CUDA time total: 1.425ms
+Self CPU time total: 3.057ms
+Self CUDA time total: 1.427ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B2_D2048_S128_W2     0.09  True
+torch_eager              cuda_B2_D2048_S128_W2     0.08  True
 torch_eager              cuda_B2_D2048_S128_W4     0.08  True
 torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
 torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
@@ -4752,7 +4752,7 @@ torch_eager              cuda_B2_D2048_S512_W2     0.08  True
 torch_eager              cuda_B2_D2048_S512_W4     0.08  True
 torch_eager              cuda_B2_D64_S128_W2     0.07  True
 torch_eager              cuda_B2_D64_S128_W4     0.09  True
-torch_eager              cuda_B2_D64_S2048_W2     0.09  True
+torch_eager              cuda_B2_D64_S2048_W2     0.08  True
 torch_eager              cuda_B2_D64_S2048_W4     0.08  True
 torch_eager              cuda_B2_D64_S512_W2     0.09  True
 torch_eager              cuda_B2_D64_S512_W4     0.08  True
@@ -4765,10 +4765,16 @@ torch_eager              cuda_B4_D2048_S512_W4     0.10  True
 torch_eager              cuda_B4_D64_S128_W2     0.08  True
 torch_eager              cuda_B4_D64_S128_W4     0.08  True
 torch_eager              cuda_B4_D64_S2048_W2     0.08  True
-torch_eager              cuda_B4_D64_S2048_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W4     0.09  True
 torch_eager              cuda_B4_D64_S512_W2     0.08  True
 torch_eager              cuda_B4_D64_S512_W4     0.08  True
 
+
+
▶ UV Install Logs
+ +

Artifacts:

causal_conv1d.jsonl diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg index e336babd9e22036016f034e9655aa303d520c536..07cfbdf7d6b5520fa7d67c8819a8378d9bcd8cb5 100644 --- a/causal_conv1d/results/artifacts/combine/latency.svg +++ b/causal_conv1d/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a640783c4d5cb4dc1763b97fa9a3e0cf2d278599a3fc38ba2056846c760ec8fe -size 35421 +oid sha256:3d92f3a3aa92e11f21958cf1c591a4e709fd40f7b0cccbd544c1e1a77b11bcd2 +size 35429 diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html index dcc52b58db96b72ed197292d2ffb66bacd9bf72c..45b22fabef9b9c6a15964465834db2598fd9e481 100644 --- a/causal_conv1d/results/combined_results.html +++ b/causal_conv1d/results/combined_results.html @@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-28T14:09:26.231666 + 2025-10-29T14:27:58.771179 image/svg+xml @@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 @@ -4287,66 +4287,66 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + @@ -4405,7 +4405,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.38s +Cell: combine | 4.32s | Raw @@ -4499,11 +4499,11 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True -hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True -hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True +hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.06 True +hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.06 True hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True -hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True -hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True +hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.06 True +hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.06 True hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True @@ -4514,9 +4514,9 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True -hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True +hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.06 True hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True -torch_eager cuda_B2_D2048_S128_W2 0.09 True +torch_eager cuda_B2_D2048_S128_W2 0.08 True torch_eager cuda_B2_D2048_S128_W4 0.08 True torch_eager cuda_B2_D2048_S2048_W2 0.15 True torch_eager cuda_B2_D2048_S2048_W4 0.16 True @@ -4524,7 +4524,7 @@ torch_eager cuda_B2_D2048_S512_W2 0.08 True torch_eager cuda_B2_D2048_S512_W4 0.08 True torch_eager cuda_B2_D64_S128_W2 0.07 True torch_eager cuda_B2_D64_S128_W4 0.09 True -torch_eager cuda_B2_D64_S2048_W2 0.09 True +torch_eager cuda_B2_D64_S2048_W2 0.08 True torch_eager cuda_B2_D64_S2048_W4 0.08 True torch_eager cuda_B2_D64_S512_W2 0.09 True torch_eager cuda_B2_D64_S512_W4 0.08 True @@ -4537,7 +4537,7 @@ torch_eager cuda_B4_D2048_S512_W4 0.10 True torch_eager cuda_B4_D64_S128_W2 0.08 True torch_eager cuda_B4_D64_S128_W4 0.08 True torch_eager cuda_B4_D64_S2048_W2 0.08 True -torch_eager cuda_B4_D64_S2048_W4 0.08 True +torch_eager cuda_B4_D64_S2048_W4 0.09 True torch_eager cuda_B4_D64_S512_W2 0.08 True torch_eager cuda_B4_D64_S512_W4 0.08 True @@ -4559,7 +4559,7 @@ Implementations included:
▶ UV Install Logs
@@ -4572,7 +4572,7 @@ Installed 37 packages in 221ms - 2025-10-28T14:09:26.231666 + 2025-10-29T14:27:58.771179 image/svg+xml @@ -4916,70 +4916,70 @@ Installed 37 packages in 221ms - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 @@ -4987,66 +4987,66 @@ Installed 37 packages in 221ms - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl index c187be19ed971576faca83871bac5aeb9c24284a..dfaf0c99c533e861b9b0cf0a7d640e38745db1c9 100644 --- a/flash_attn/impls/artifacts/benchmark/attention.jsonl +++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl @@ -1,6 +1,6 @@ -{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9106109999947876, "p50": 0.9171110000352201, "p90": 0.9204320000435473, "mean": 0.9179216000347878, "iqr": 0.005419999979494605, "raw_times": [0.9171110000352201, 0.9150120000640527, 0.9106109999947876, 0.9204320000435473, 0.9264420000363316], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9176309999929799, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9576329999845257, "p50": 0.960063999968952, "p90": 0.9623629999850891, "mean": 0.9611931999643275, "iqr": 0.0033900000744324643, "raw_times": [0.9589729999106567, 0.9576329999845257, 0.960063999968952, 0.9669329999724141, 0.9623629999850891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9673530000782193, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0019650000003821, "p50": 1.0193159999971613, "p90": 1.0211459999709405, "mean": 1.015251600006195, "iqr": 0.01198099994326185, "raw_times": [1.0019650000003821, 1.0091650000276786, 1.024666000034813, 1.0193159999971613, 1.0211459999709405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.009233999980097, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0176959999625979, "p50": 1.0199849999708022, "p90": 1.025695000066662, "mean": 1.0218714000075124, "iqr": 0.006820000066909415, "raw_times": [1.0271060000377474, 1.0176959999625979, 1.0188749999997526, 1.0199849999708022, 1.025695000066662], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.027405000058934, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1665810000067722, "p50": 1.1845809999613266, "p90": 1.185440999961429, "mean": 1.1787729999923613, "iqr": 0.01419000000169035, "raw_times": [1.1712509999597387, 1.1665810000067722, 1.18601100007254, 1.1845809999613266, 1.185440999961429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1787800000320203, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1722899999995207, "p50": 1.1832310000272628, "p90": 1.1854509999693619, "mean": 1.181276799979969, "iqr": 0.008630000024822948, "raw_times": [1.1885909999591604, 1.1854509999693619, 1.176820999944539, 1.1832310000272628, 1.1722899999995207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1782799999764393, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:40Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.98791400000664, "p50": 0.995113999977093, "p90": 1.0003840000081254, "mean": 0.9967803999984426, "iqr": 0.00634899998885885, "raw_times": [0.98791400000664, 0.9940350000192666, 1.006454999981088, 1.0003840000081254, 0.995113999977093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0150049999992916, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0233649999804584, "p50": 1.0321449999537435, "p90": 1.0348449999924014, "mean": 1.032277399974646, "iqr": 0.009739000006447895, "raw_times": [1.045925999960673, 1.0251059999859535, 1.0321449999537435, 1.0233649999804584, 1.0348449999924014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.035865999995167, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0737370000128976, "p50": 1.084086999981082, "p90": 1.088675999994848, "mean": 1.0826705999988917, "iqr": 0.0148400000057336, "raw_times": [1.0738359999891145, 1.0930170000165162, 1.084086999981082, 1.0737370000128976, 1.088675999994848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0889670000437945, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0887770000067576, "p50": 1.0916359999555425, "p90": 1.096396999969329, "mean": 1.0932085999797891, "iqr": 0.005600999998023326, "raw_times": [1.0887770000067576, 1.0907959999713057, 1.0916359999555425, 1.0984369999960109, 1.096396999969329], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1031370000296192, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2465009999687027, "p50": 1.2523310000460697, "p90": 1.2523909999799798, "mean": 1.2538410000047406, "iqr": 0.005290999979479238, "raw_times": [1.2523310000460697, 1.2523909999799798, 1.2471000000005006, 1.2465009999687027, 1.2708820000284504], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2551809999763464, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2355700000057368, "p50": 1.241141000036805, "p90": 1.2576200000466997, "mean": 1.2477664000130062, "iqr": 0.02047000003813082, "raw_times": [1.2355700000057368, 1.241141000036805, 1.237150000008569, 1.2576200000466997, 1.2673509999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2579809999806457, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py index 15f02e2ed444e10eba9708f3f69247414b6c962b..04ae262009c3d6e33aaa3e392d28c903f24c287c 100644 --- a/flash_attn/impls/cells/benchmark.py +++ b/flash_attn/impls/cells/benchmark.py @@ -4,7 +4,7 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", -# "kernels", +# "xformers", # ] # # [tool.uv.sources] @@ -13,19 +13,18 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -from kernels import get_kernel +import xformers.ops as xops -# Load the flash attention 3 kernel -hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3") - -def hf_flash_attention3(query, key, value): - return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0] +def xformers_attention(q, k, v): + """xFormers memory efficient attention""" + # xFormers expects [batch, seq_len, heads, head_dim] + return xops.memory_efficient_attention(q, k, v) run_benchmark( kernel_type=KernelTypeEnum.ATTENTION, - impl_name="hf_kernels_flash_attn3", - impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, - impl_func=hf_flash_attention3, + impl_name="xformers_meff", + impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"}, + impl_func=xformers_attention, ) \ No newline at end of file diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html index b4834aa56614f91a384d067a2ab29e14d8abc5f4..a6e50f4eba46389d1f17c35d67cbb770dc3d8952 100644 --- a/flash_attn/impls/flash_attention.html +++ b/flash_attn/impls/flash_attention.html @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.26s +Cell: nv | 0.28s | Raw @@ -3888,7 +3888,7 @@ Cell: nv | 0.26s
-
Tue Oct 28 14:08:39 2025       
+
Wed Oct 29 14:25:53 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3897,7 +3897,7 @@ Cell: nv | 0.26s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0            153W /  350W |       0MiB /  46068MiB |     26%      Default |
+| N/A   27C    P8             21W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3919,9 +3919,9 @@ Cell: nv | 0.26s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 3.83s
+Cell: benchmark | 32.77s
  | 
 
 Raw
@@ -3972,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.585ms       101.47%       3.585ms       3.585ms             1  
-                                         torch_flash_ma         6.34%     327.656us        45.53%       2.352ms       2.352ms       0.000us         0.00%       3.573ms       3.573ms             1  
-                     aten::scaled_dot_product_attention         0.82%      42.312us         4.12%     213.057us      71.019us       0.000us         0.00%       2.820ms     940.062us             3  
-              aten::_scaled_dot_product_flash_attention         0.51%      26.321us         3.31%     170.745us      56.915us       0.000us         0.00%       2.820ms     940.062us             3  
-                         aten::_flash_attention_forward         0.73%      37.527us         2.40%     124.015us      41.338us       2.820ms        79.83%       2.820ms     940.062us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.820ms        79.83%       2.820ms     940.062us             3  
-                                       aten::contiguous         0.27%      14.121us        33.79%       1.745ms     145.446us       0.000us         0.00%     752.928us      62.744us            12  
-                                            aten::clone         0.72%      37.329us        33.52%       1.731ms     144.269us       0.000us         0.00%     752.928us      62.744us            12  
-                                            aten::copy_         1.68%      87.013us        31.25%       1.614ms     134.513us     712.672us        20.17%     752.928us      62.744us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     712.672us        20.17%     712.672us      59.389us            12  
-                                Activity Buffer Request        27.64%       1.428ms        27.64%       1.428ms       1.428ms      40.256us         1.14%      40.256us      40.256us             1  
-                                        aten::transpose         1.24%      64.087us         1.67%      86.009us       3.584us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.42%      21.922us         0.42%      21.922us       0.913us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.48%      24.711us         1.99%     102.775us       6.852us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.74%      89.843us         1.74%      89.843us       3.743us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.38%     122.771us         2.38%     122.771us       8.185us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.34%      17.310us         0.34%      17.310us       5.770us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       2.229us         0.04%       2.229us       0.372us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.17%       8.900us         0.17%       8.900us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        54.47%       2.814ms        54.47%       2.814ms       2.814ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.644ms       102.02%       3.644ms       3.644ms             1  
+                                         torch_flash_ma         6.80%     356.846us        47.04%       2.468ms       2.468ms       0.000us         0.00%       3.612ms       3.612ms             1  
+                     aten::scaled_dot_product_attention         0.82%      43.042us         4.47%     234.776us      78.259us       0.000us         0.00%       2.857ms     952.201us             3  
+              aten::_scaled_dot_product_flash_attention         0.56%      29.330us         3.65%     191.734us      63.911us       0.000us         0.00%       2.857ms     952.201us             3  
+                         aten::_flash_attention_forward         0.75%      39.581us         2.59%     135.674us      45.225us       2.857ms        79.97%       2.857ms     952.201us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.857ms        79.97%       2.857ms     952.201us             3  
+                                       aten::contiguous         0.27%      14.180us        34.32%       1.801ms     150.051us       0.000us         0.00%     755.680us      62.973us            12  
+                                            aten::clone         0.74%      38.791us        34.04%       1.786ms     148.870us       0.000us         0.00%     755.680us      62.973us            12  
+                                            aten::copy_         1.85%      97.030us        31.43%       1.649ms     137.429us     715.456us        20.03%     755.680us      62.973us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     715.456us        20.03%     715.456us      59.621us            12  
+                                Activity Buffer Request        27.38%       1.437ms        27.38%       1.437ms       1.437ms      40.224us         1.13%      40.224us      40.224us             1  
+                                        aten::transpose         1.47%      77.273us         1.96%     102.714us       4.280us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.48%      25.441us         0.48%      25.441us       1.060us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.70%      36.821us         2.35%     123.326us       8.222us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.93%     101.493us         1.93%     101.493us       4.229us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.70%     141.775us         2.70%     141.775us       9.452us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.35%      18.402us         0.35%      18.402us       6.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.540us         0.05%       2.540us       0.423us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.17%       8.890us         0.17%       8.890us       2.963us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        52.96%       2.779ms        52.96%       2.779ms       2.779ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.165ms
-Self CUDA time total: 3.533ms
+Self CPU time total: 5.247ms
+Self CUDA time total: 3.572ms
 
 
 
@@ -4004,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.84%     255.079us        41.49%       2.188ms       2.188ms       0.000us         0.00%       3.787ms       3.787ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.743ms       100.29%       3.743ms       3.743ms             1  
-                     aten::scaled_dot_product_attention         0.47%      24.640us         3.42%     180.356us      60.119us       0.000us         0.00%       2.967ms     989.106us             3  
-              aten::_scaled_dot_product_flash_attention         0.36%      19.241us         2.95%     155.716us      51.905us       0.000us         0.00%       2.967ms     989.106us             3  
-                         aten::_flash_attention_forward         0.73%      38.683us         2.19%     115.525us      38.508us       2.967ms        79.51%       2.967ms     989.106us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.967ms        79.51%       2.967ms     989.106us             3  
-                                       aten::contiguous         0.17%       8.802us        32.41%       1.709ms     142.425us       0.000us         0.00%     819.868us      68.322us            12  
-                                            aten::clone         0.52%      27.349us        32.24%       1.700ms     141.692us       0.000us         0.00%     819.868us      68.322us            12  
-                                            aten::copy_         1.56%      82.061us        30.60%       1.614ms     134.473us     764.892us        20.49%     819.868us      68.322us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     764.892us        20.49%     764.892us      63.741us            12  
-                                Activity Buffer Request        27.50%       1.450ms        27.50%       1.450ms       1.450ms      54.976us         1.47%      54.976us      54.976us             1  
-                                        aten::transpose         0.91%      47.959us         1.22%      64.512us       2.688us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.31%      16.553us         0.31%      16.553us       0.690us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.39%      20.732us         1.52%      80.304us       5.354us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.38%      72.972us         1.38%      72.972us       3.040us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.96%     103.146us         1.96%     103.146us       6.876us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.28%      14.880us         0.28%      14.880us       4.960us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.800us         0.03%       1.800us       0.300us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.830us         0.07%       3.830us       1.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.51%       3.085ms        58.51%       3.085ms       3.085ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.70%     246.528us        41.73%       2.189ms       2.189ms       0.000us         0.00%       3.817ms       3.817ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.772ms       100.28%       3.772ms       3.772ms             1  
+                     aten::scaled_dot_product_attention         0.51%      26.610us         3.43%     180.143us      60.048us       0.000us         0.00%       2.999ms     999.573us             3  
+              aten::_scaled_dot_product_flash_attention         0.37%      19.600us         2.93%     153.533us      51.178us       0.000us         0.00%       2.999ms     999.573us             3  
+                         aten::_flash_attention_forward         0.63%      32.980us         2.12%     111.443us      37.148us       2.999ms        79.71%       2.999ms     999.573us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.999ms        79.71%       2.999ms     999.573us             3  
+                                       aten::contiguous         0.19%      10.030us        32.68%       1.715ms     142.893us       0.000us         0.00%     818.210us      68.184us            12  
+                                            aten::clone         0.55%      29.002us        32.49%       1.705ms     142.057us       0.000us         0.00%     818.210us      68.184us            12  
+                                            aten::copy_         2.09%     109.441us        30.74%       1.613ms     134.399us     763.297us        20.29%     818.210us      68.184us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     763.297us        20.29%     763.297us      63.608us            12  
+                                Activity Buffer Request        26.94%       1.413ms        26.94%       1.413ms       1.413ms      54.913us         1.46%      54.913us      54.913us             1  
+                                        aten::transpose         1.00%      52.652us         1.34%      70.433us       2.935us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.34%      17.781us         0.34%      17.781us       0.741us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.38%      19.980us         1.61%      84.581us       5.639us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.45%      76.201us         1.45%      76.201us       3.175us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.16%     113.102us         2.16%     113.102us       7.540us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.31%      16.430us         0.31%      16.430us       5.477us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.751us         0.03%       1.751us       0.292us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.771us         0.07%       3.771us       1.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.27%       3.058ms        58.27%       3.058ms       3.058ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.273ms
-Self CUDA time total: 3.732ms
+Self CPU time total: 5.247ms
+Self CUDA time total: 3.762ms
 
 
 
@@ -4036,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.77%     251.162us        41.45%       2.184ms       2.184ms       0.000us         0.00%       3.786ms       3.786ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.738ms       100.28%       3.738ms       3.738ms             1  
-                     aten::scaled_dot_product_attention         0.46%      24.280us         3.42%     180.086us      60.029us       0.000us         0.00%       2.949ms     982.872us             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      18.160us         2.96%     155.806us      51.935us       0.000us         0.00%       2.949ms     982.872us             3  
-                         aten::_flash_attention_forward         0.73%      38.599us         2.20%     115.865us      38.622us       2.949ms        79.09%       2.949ms     982.872us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.949ms        79.09%       2.949ms     982.872us             3  
-                                       aten::contiguous         0.17%       8.991us        32.44%       1.710ms     142.465us       0.000us         0.00%     837.719us      69.810us            12  
-                                            aten::clone         0.53%      27.728us        32.27%       1.701ms     141.715us       0.000us         0.00%     837.719us      69.810us            12  
-                                            aten::copy_         1.52%      79.873us        30.57%       1.611ms     134.242us     779.480us        20.91%     837.719us      69.810us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     779.480us        20.91%     779.480us      64.957us            12  
-                                Activity Buffer Request        27.50%       1.449ms        27.50%       1.449ms       1.449ms      58.239us         1.56%      58.239us      58.239us             1  
-                                        aten::transpose         0.92%      48.219us         1.24%      65.252us       2.719us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.32%      17.033us         0.32%      17.033us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.37%      19.303us         1.55%      81.795us       5.453us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.44%      76.031us         1.44%      76.031us       3.168us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.98%     104.564us         1.98%     104.564us       6.971us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.28%      14.492us         0.28%      14.492us       4.831us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       1.860us         0.04%       1.860us       0.310us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.10%       5.030us         0.10%       5.030us       1.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.55%       3.085ms        58.55%       3.085ms       3.085ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.50%     237.986us        41.18%       2.178ms       2.178ms       0.000us         0.00%       3.833ms       3.833ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.785ms       100.29%       3.785ms       3.785ms             1  
+                     aten::scaled_dot_product_attention         0.46%      24.381us         3.40%     179.915us      59.972us       0.000us         0.00%       2.998ms     999.221us             3  
+              aten::_scaled_dot_product_flash_attention         0.36%      19.171us         2.94%     155.534us      51.845us       0.000us         0.00%       2.998ms     999.221us             3  
+                         aten::_flash_attention_forward         0.65%      34.259us         2.15%     113.691us      37.897us       2.998ms        79.44%       2.998ms     999.221us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.998ms        79.44%       2.998ms     999.221us             3  
+                                       aten::contiguous         0.19%       9.800us        32.38%       1.712ms     142.708us       0.000us         0.00%     835.263us      69.605us            12  
+                                            aten::clone         0.53%      28.211us        32.20%       1.703ms     141.891us       0.000us         0.00%     835.263us      69.605us            12  
+                                            aten::copy_         1.60%      84.650us        30.46%       1.611ms     134.247us     776.063us        20.56%     835.263us      69.605us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     776.063us        20.56%     776.063us      64.672us            12  
+                                Activity Buffer Request        27.18%       1.437ms        27.18%       1.437ms       1.437ms      59.200us         1.57%      59.200us      59.200us             1  
+                                        aten::transpose         0.99%      52.225us         1.33%      70.125us       2.922us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.34%      17.900us         0.34%      17.900us       0.746us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.37%      19.782us         1.60%      84.803us       5.654us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.45%      76.431us         1.45%      76.431us       3.185us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.16%     114.204us         2.16%     114.204us       7.614us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.30%      16.100us         0.30%      16.100us       5.367us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.730us         0.03%       1.730us       0.288us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.730us         0.07%       3.730us       1.243us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.82%       3.110ms        58.82%       3.110ms       3.110ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.269ms
-Self CUDA time total: 3.728ms
+Self CPU time total: 5.288ms
+Self CUDA time total: 3.774ms
 
 
 
@@ -4068,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         5.01%     280.573us        44.17%       2.475ms       2.475ms       0.000us         0.00%       3.878ms       3.878ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.831ms       100.27%       3.831ms       3.831ms             1  
-                     aten::scaled_dot_product_attention         0.48%      26.630us         3.39%     189.956us      63.319us       0.000us         0.00%       3.032ms       1.011ms             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      19.101us         2.91%     163.326us      54.442us       0.000us         0.00%       3.032ms       1.011ms             3  
-                         aten::_flash_attention_forward         0.70%      39.063us         2.15%     120.325us      40.108us       3.032ms        79.37%       3.032ms       1.011ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.032ms        79.37%       3.032ms       1.011ms             3  
-                                       aten::contiguous         0.17%       9.271us        34.98%       1.960ms     163.354us       0.000us         0.00%     845.820us      70.485us            12  
-                                            aten::clone         0.52%      28.974us        34.82%       1.951ms     162.581us       0.000us         0.00%     845.820us      70.485us            12  
-                                            aten::copy_         1.48%      83.180us        33.17%       1.859ms     154.908us     788.284us        20.63%     845.820us      70.485us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     788.284us        20.63%     788.284us      65.690us            12  
-                                Activity Buffer Request        26.18%       1.467ms        26.18%       1.467ms       1.467ms      57.536us         1.51%      57.536us      57.536us             1  
-                                        aten::transpose         0.89%      50.110us         1.21%      67.952us       2.831us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.32%      17.842us         0.32%      17.842us       0.743us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.36%      19.969us         1.53%      85.492us       5.699us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.37%      76.982us         1.37%      76.982us       3.208us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         5.95%     333.480us         5.95%     333.480us      22.232us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.30%      17.041us         0.30%      17.041us       5.680us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.700us         0.03%       1.700us       0.283us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.040us         0.07%       4.040us       1.347us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        55.83%       3.129ms        55.83%       3.129ms       3.129ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.36%     241.837us        43.33%       2.405ms       2.405ms       0.000us         0.00%       3.884ms       3.884ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.837ms       100.27%       3.837ms       3.837ms             1  
+                     aten::scaled_dot_product_attention         0.48%      26.802us         3.27%     181.715us      60.572us       0.000us         0.00%       3.042ms       1.014ms             3  
+              aten::_scaled_dot_product_flash_attention         0.35%      19.308us         2.79%     154.913us      51.638us       0.000us         0.00%       3.042ms       1.014ms             3  
+                         aten::_flash_attention_forward         0.60%      33.361us         2.03%     112.712us      37.571us       3.042ms        79.50%       3.042ms       1.014ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.042ms        79.50%       3.042ms       1.014ms             3  
+                                       aten::contiguous         0.17%       9.659us        34.84%       1.934ms     161.162us       0.000us         0.00%     841.829us      70.152us            12  
+                                            aten::clone         0.50%      27.830us        34.67%       1.924ms     160.357us       0.000us         0.00%     841.829us      70.152us            12  
+                                            aten::copy_         1.56%      86.702us        32.55%       1.807ms     150.547us     784.548us        20.50%     841.829us      70.152us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     784.548us        20.50%     784.548us      65.379us            12  
+                                Activity Buffer Request        25.45%       1.413ms        25.45%       1.413ms       1.413ms      57.281us         1.50%      57.281us      57.281us             1  
+                                        aten::transpose         0.95%      52.620us         1.27%      70.404us       2.933us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.32%      17.784us         0.32%      17.784us       0.741us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.78%      43.221us         2.00%     111.194us       7.413us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.45%      80.673us         1.45%      80.673us       3.361us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.96%     331.078us         5.96%     331.078us      22.072us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.28%      15.800us         0.28%      15.800us       5.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.730us         0.03%       1.730us       0.288us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.850us         0.07%       3.850us       1.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        56.67%       3.146ms        56.67%       3.146ms       3.146ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.603ms
-Self CUDA time total: 3.820ms
+Self CPU time total: 5.551ms
+Self CUDA time total: 3.827ms
 
 
 
@@ -4100,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         5.07%     303.893us        39.93%       2.395ms       2.395ms       0.000us         0.00%       4.370ms       4.370ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.320ms       100.25%       4.320ms       4.320ms             1  
-                     aten::scaled_dot_product_attention         0.41%      24.650us         3.07%     184.006us      61.335us       0.000us         0.00%       3.503ms       1.168ms             3  
-              aten::_scaled_dot_product_flash_attention         0.32%      19.311us         2.66%     159.356us      53.119us       0.000us         0.00%       3.503ms       1.168ms             3  
-                         aten::_flash_attention_forward         0.68%      40.911us         1.97%     118.205us      39.402us       3.503ms        81.28%       3.503ms       1.168ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.503ms        81.28%       3.503ms       1.168ms             3  
-                                       aten::contiguous         0.15%       8.977us        31.04%       1.862ms     155.201us       0.000us         0.00%     867.581us      72.298us            12  
-                                            aten::clone         0.47%      28.114us        30.89%       1.853ms     154.453us       0.000us         0.00%     867.581us      72.298us            12  
-                                            aten::copy_         1.36%      81.500us        29.40%       1.764ms     146.991us     806.749us        18.72%     867.581us      72.298us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     806.749us        18.72%     806.749us      67.229us            12  
-                                Activity Buffer Request        23.82%       1.429ms        23.82%       1.429ms       1.429ms      60.832us         1.41%      60.832us      60.832us             1  
-                                        aten::transpose         0.82%      49.363us         1.11%      66.863us       2.786us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.29%      17.500us         0.29%      17.500us       0.729us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.33%      20.081us         1.37%      82.424us       5.495us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.26%      75.593us         1.26%      75.593us       3.150us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         4.60%     275.759us         4.60%     275.759us      18.384us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.25%      15.251us         0.25%      15.251us       5.084us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.740us         0.03%       1.740us       0.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.06%       3.680us         0.06%       3.680us       1.227us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        60.07%       3.604ms        60.07%       3.604ms       3.604ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.46%     268.165us        40.09%       2.413ms       2.413ms       0.000us         0.00%       4.405ms       4.405ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.355ms       100.25%       4.355ms       4.355ms             1  
+                     aten::scaled_dot_product_attention         0.46%      27.642us         3.64%     218.806us      72.935us       0.000us         0.00%       3.540ms       1.180ms             3  
+              aten::_scaled_dot_product_flash_attention         0.75%      45.250us         3.18%     191.164us      63.721us       0.000us         0.00%       3.540ms       1.180ms             3  
+                         aten::_flash_attention_forward         0.61%      36.651us         2.01%     120.923us      40.308us       3.540ms        81.48%       3.540ms       1.180ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.540ms        81.48%       3.540ms       1.180ms             3  
+                                       aten::contiguous         0.18%      10.862us        31.11%       1.873ms     156.050us       0.000us         0.00%     865.606us      72.134us            12  
+                                            aten::clone         0.51%      30.490us        30.93%       1.862ms     155.145us       0.000us         0.00%     865.606us      72.134us            12  
+                                            aten::copy_         1.51%      90.931us        29.34%       1.766ms     147.155us     804.645us        18.52%     865.606us      72.134us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     804.645us        18.52%     804.645us      67.054us            12  
+                                Activity Buffer Request        21.61%       1.300ms        21.61%       1.300ms       1.300ms      60.961us         1.40%      60.961us      60.961us             1  
+                                        aten::transpose         0.99%      59.753us         1.30%      78.501us       3.271us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.31%      18.748us         0.31%      18.748us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.35%      20.935us         1.45%      87.165us       5.811us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.32%      79.690us         1.32%      79.690us       3.320us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.67%     401.680us         6.67%     401.680us      26.779us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.27%      16.081us         0.27%      16.081us       5.360us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       2.030us         0.03%       2.030us       0.338us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.810us         0.06%       3.810us       1.270us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.91%       3.605ms        59.91%       3.605ms       3.605ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.999ms
-Self CUDA time total: 4.309ms
+Self CPU time total: 6.018ms
+Self CUDA time total: 4.344ms
 
 
 
@@ -4132,39 +4132,91 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         3.83%     232.270us        37.82%       2.296ms       2.296ms       0.000us         0.00%       4.474ms       4.474ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.423ms       100.25%       4.423ms       4.423ms             1  
-                     aten::scaled_dot_product_attention         0.41%      24.850us         2.85%     172.746us      57.582us       0.000us         0.00%       3.595ms       1.198ms             3  
-              aten::_scaled_dot_product_flash_attention         0.30%      18.250us         2.44%     147.896us      49.299us       0.000us         0.00%       3.595ms       1.198ms             3  
-                         aten::_flash_attention_forward         0.54%      32.692us         1.77%     107.224us      35.741us       3.595ms        81.48%       3.595ms       1.198ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.595ms        81.48%       3.595ms       1.198ms             3  
-                                       aten::contiguous         0.14%       8.610us        30.41%       1.846ms     153.859us       0.000us         0.00%     878.139us      73.178us            12  
-                                            aten::clone         0.45%      27.368us        30.27%       1.838ms     153.142us       0.000us         0.00%     878.139us      73.178us            12  
-                                            aten::copy_         1.35%      81.917us        28.83%       1.750ms     145.831us     817.083us        18.52%     878.139us      73.178us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     817.083us        18.52%     817.083us      68.090us            12  
-                                Activity Buffer Request        23.72%       1.440ms        23.72%       1.440ms       1.440ms      61.056us         1.38%      61.056us      61.056us             1  
-                                        aten::transpose         0.82%      50.064us         1.10%      66.792us       2.783us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.28%      16.728us         0.28%      16.728us       0.697us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.32%      19.431us         1.31%      79.591us       5.306us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.21%      73.220us         1.21%      73.220us       3.051us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         4.12%     249.950us         4.12%     249.950us      16.663us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.24%      14.270us         0.24%      14.270us       4.757us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.680us         0.03%       1.680us       0.280us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.380us         0.07%       4.380us       1.460us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.18%       3.775ms        62.18%       3.775ms       3.775ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.01%     246.839us        39.75%       2.447ms       2.447ms       0.000us         0.00%       4.458ms       4.458ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.407ms       100.23%       4.407ms       4.407ms             1  
+                     aten::scaled_dot_product_attention         0.40%      24.621us         2.95%     181.474us      60.491us       0.000us         0.00%       3.579ms       1.193ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      20.980us         2.55%     156.853us      52.284us       0.000us         0.00%       3.579ms       1.193ms             3  
+                         aten::_flash_attention_forward         0.58%      35.588us         1.84%     113.003us      37.668us       3.579ms        81.40%       3.579ms       1.193ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.579ms        81.40%       3.579ms       1.193ms             3  
+                                       aten::contiguous         0.16%      10.061us        32.01%       1.971ms     164.244us       0.000us         0.00%     878.818us      73.235us            12  
+                                            aten::clone         0.50%      30.903us        31.85%       1.961ms     163.406us       0.000us         0.00%     878.818us      73.235us            12  
+                                            aten::copy_         1.35%      82.841us        30.27%       1.864ms     155.305us     817.634us        18.60%     878.818us      73.235us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     817.634us        18.60%     817.634us      68.136us            12  
+                                Activity Buffer Request        23.50%       1.447ms        23.50%       1.447ms       1.447ms      61.184us         1.39%      61.184us      61.184us             1  
+                                        aten::transpose         0.85%      52.630us         1.15%      70.790us       2.950us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.29%      18.160us         0.29%      18.160us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.33%      20.456us         1.41%      86.700us       5.780us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.28%      78.794us         1.28%      78.794us       3.283us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.81%     357.919us         5.81%     357.919us      23.861us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.25%      15.401us         0.25%      15.401us       5.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.632us         0.03%       1.632us       0.272us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.720us         0.06%       3.720us       1.240us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        60.25%       3.709ms        60.25%       3.709ms       3.709ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.071ms
-Self CUDA time total: 4.413ms
+Self CPU time total: 6.156ms
+Self CUDA time total: 4.397ms
 
 
 impl                     wl                  p50(ms)  ok
 torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
-torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
-torch_flash_ma           cuda_attn_L320_bfloat16     1.28  True
-torch_flash_ma           cuda_attn_L384_bfloat16     1.31  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.28  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.29  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.33  True
 torch_flash_ma           cuda_attn_L448_bfloat16     1.47  True
 torch_flash_ma           cuda_attn_L512_bfloat16     1.50  True
 
+
+
▶ UV Install Logs
+ +

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html index 6414b268459e56cf2a96ef4b229b35fde2e104fa..7d03567858952d02de89e25ce04873ef34373a75 100644 --- a/flash_attn/impls/hf_kernels_flash_attn.html +++ b/flash_attn/impls/hf_kernels_flash_attn.html @@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 6.08s +Cell: benchmark | 5.58s | Raw @@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 3.64% 160.058us 41.50% 1.823ms 1.823ms 0.000us 0.00% 3.744ms 3.744ms 1 - _flash_attn_9e27194::fwd 1.78% 78.347us 37.86% 1.663ms 554.208us 2.792ms 100.00% 3.744ms 1.248ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.05% 2.794ms 2.794ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.792ms 100.00% 2.792ms 930.800us 3 - Activity Buffer Request 33.00% 1.449ms 33.00% 1.449ms 1.449ms 951.685us 34.08% 951.685us 951.685us 1 - cudaDeviceGetAttribute 0.13% 5.638us 0.13% 5.638us 0.376us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.40% 17.551us 1.19% 52.122us 17.374us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.79% 34.571us 0.79% 34.571us 11.524us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.57% 24.890us 0.57% 24.890us 2.766us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.28% 12.210us 0.28% 12.210us 4.070us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.92% 40.292us 0.92% 40.292us 13.431us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.50% 2.569ms 58.50% 2.569ms 2.569ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 3.55% 156.153us 41.08% 1.807ms 1.807ms 0.000us 0.00% 3.775ms 3.775ms 1 + _flash_attn_9e27194::fwd 1.65% 72.542us 37.53% 1.651ms 550.240us 2.812ms 100.00% 3.775ms 1.258ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.814ms 100.05% 2.814ms 2.814ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.812ms 100.00% 2.812ms 937.398us 3 + Activity Buffer Request 32.22% 1.417ms 32.22% 1.417ms 1.417ms 962.880us 34.24% 962.880us 962.880us 1 + cudaDeviceGetAttribute 0.13% 5.500us 0.13% 5.500us 0.367us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.43% 19.110us 1.25% 54.882us 18.294us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.81% 35.772us 0.81% 35.772us 11.924us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.57% 25.101us 0.57% 25.101us 2.789us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.30% 13.270us 0.30% 13.270us 4.423us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.42% 62.402us 1.42% 62.402us 20.801us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.92% 2.591ms 58.92% 2.591ms 2.591ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.392ms -Self CUDA time total: 2.792ms +Self CPU time total: 4.398ms +Self CUDA time total: 2.812ms @@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.22% 99.144us 37.48% 1.673ms 1.673ms 0.000us 0.00% 3.949ms 3.949ms 1 - _flash_attn_9e27194::fwd 1.20% 53.462us 35.26% 1.574ms 524.654us 2.953ms 100.00% 3.949ms 1.316ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.955ms 100.05% 2.955ms 2.955ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.436us 3 - Activity Buffer Request 32.23% 1.439ms 32.23% 1.439ms 1.439ms 995.807us 33.72% 995.807us 995.807us 1 - cudaDeviceGetAttribute 0.10% 4.621us 0.10% 4.621us 0.308us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.17% 7.710us 0.56% 24.861us 8.287us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.38% 17.151us 0.38% 17.151us 5.717us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.47% 21.122us 0.47% 21.122us 2.347us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.61% 27.380us 0.61% 27.380us 9.127us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 62.52% 2.791ms 62.52% 2.791ms 2.791ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.04% 91.192us 36.62% 1.634ms 1.634ms 0.000us 0.00% 3.983ms 3.983ms 1 + _flash_attn_9e27194::fwd 1.11% 49.718us 34.57% 1.543ms 514.203us 2.978ms 100.00% 3.983ms 1.328ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.980ms 100.05% 2.980ms 2.980ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.978ms 100.00% 2.978ms 992.707us 3 + Activity Buffer Request 31.74% 1.416ms 31.74% 1.416ms 1.416ms 1.004ms 33.73% 1.004ms 1.004ms 1 + cudaDeviceGetAttribute 0.08% 3.711us 0.08% 3.711us 0.247us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.17% 7.481us 0.51% 22.841us 7.614us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.34% 15.360us 0.34% 15.360us 5.120us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.46% 20.620us 0.46% 20.620us 2.291us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.741us 0.08% 3.741us 1.247us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.58% 25.842us 0.58% 25.842us 8.614us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.38% 2.828ms 63.38% 2.828ms 2.828ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.464ms -Self CUDA time total: 2.953ms +Self CPU time total: 4.462ms +Self CUDA time total: 2.978ms @@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.58% 116.955us 37.54% 1.702ms 1.702ms 0.000us 0.00% 4.041ms 4.041ms 1 - _flash_attn_9e27194::fwd 1.53% 69.255us 34.96% 1.585ms 528.314us 3.010ms 100.00% 4.041ms 1.347ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.05% 3.012ms 3.012ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.010ms 100.00% 3.010ms 1.003ms 3 - Activity Buffer Request 31.53% 1.430ms 31.53% 1.430ms 1.430ms 1.031ms 34.26% 1.031ms 1.031ms 1 - cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.18% 8.151us 0.57% 25.801us 8.600us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.39% 17.650us 0.39% 17.650us 5.883us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.48% 21.771us 0.48% 21.771us 2.419us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.10% 4.360us 0.10% 4.360us 1.453us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.66% 29.790us 0.66% 29.790us 9.930us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 62.46% 2.832ms 62.46% 2.832ms 2.832ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.28% 105.284us 36.17% 1.673ms 1.673ms 0.000us 0.00% 4.145ms 4.145ms 1 + _flash_attn_9e27194::fwd 1.09% 50.271us 33.89% 1.567ms 522.459us 3.096ms 100.00% 4.145ms 1.382ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.098ms 100.05% 3.098ms 3.098ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.096ms 100.00% 3.096ms 1.032ms 3 + Activity Buffer Request 31.08% 1.437ms 31.08% 1.437ms 1.437ms 1.049ms 33.87% 1.049ms 1.049ms 1 + cudaDeviceGetAttribute 0.08% 3.850us 0.08% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 7.061us 0.49% 22.631us 7.544us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.34% 15.570us 0.34% 15.570us 5.190us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.47% 21.760us 0.47% 21.760us 2.418us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.689us 0.08% 3.689us 1.230us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.61% 27.992us 0.61% 27.992us 9.331us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.83% 2.952ms 63.83% 2.952ms 2.952ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.534ms -Self CUDA time total: 3.010ms +Self CPU time total: 4.625ms +Self CUDA time total: 3.096ms @@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.39% 114.805us 40.03% 1.925ms 1.925ms 0.000us 0.00% 4.094ms 4.094ms 1 - _flash_attn_9e27194::fwd 1.09% 52.653us 37.65% 1.810ms 603.407us 3.063ms 100.00% 4.094ms 1.365ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.065ms 100.05% 3.065ms 3.065ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.063ms 100.00% 3.063ms 1.021ms 3 - Activity Buffer Request 29.78% 1.432ms 29.78% 1.432ms 1.432ms 1.031ms 33.65% 1.031ms 1.031ms 1 - cudaDeviceGetAttribute 0.10% 4.861us 0.10% 4.861us 0.324us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.16% 7.720us 0.55% 26.331us 8.777us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.39% 18.611us 0.39% 18.611us 6.204us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.728us 0.08% 3.728us 1.243us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 5.59% 268.862us 5.59% 268.862us 89.621us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 59.97% 2.884ms 59.97% 2.884ms 2.884ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.30% 110.882us 38.29% 1.842ms 1.842ms 0.000us 0.00% 4.161ms 4.161ms 1 + _flash_attn_9e27194::fwd 1.05% 50.321us 35.98% 1.731ms 577.014us 3.117ms 100.00% 4.161ms 1.387ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.118ms 100.05% 3.118ms 3.118ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.117ms 100.00% 3.117ms 1.039ms 3 + Activity Buffer Request 29.64% 1.426ms 29.64% 1.426ms 1.426ms 1.044ms 33.50% 1.044ms 1.044ms 1 + cudaDeviceGetAttribute 0.08% 3.780us 0.08% 3.780us 0.252us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 7.259us 0.50% 24.240us 8.080us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.35% 16.981us 0.35% 16.981us 5.660us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.45% 21.602us 0.45% 21.602us 2.400us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.770us 0.08% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.18% 201.205us 4.18% 201.205us 67.068us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 61.71% 2.969ms 61.71% 2.969ms 2.969ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.809ms -Self CUDA time total: 3.063ms +Self CPU time total: 4.811ms +Self CUDA time total: 3.117ms @@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.13% 113.755us 35.84% 1.918ms 1.918ms 0.000us 0.00% 4.786ms 4.786ms 1 - _flash_attn_9e27194::fwd 1.02% 54.483us 33.71% 1.804ms 601.364us 3.588ms 100.00% 4.786ms 1.595ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.590ms 100.04% 3.590ms 3.590ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.588ms 100.00% 3.588ms 1.196ms 3 - Activity Buffer Request 26.99% 1.445ms 26.99% 1.445ms 1.445ms 1.198ms 33.38% 1.198ms 1.198ms 1 - cudaDeviceGetAttribute 0.08% 4.270us 0.08% 4.270us 0.285us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.15% 8.039us 0.48% 25.640us 8.547us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.33% 17.601us 0.33% 17.601us 5.867us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.40% 21.582us 0.40% 21.582us 2.398us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.07% 3.700us 0.07% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.67% 249.891us 4.67% 249.891us 83.297us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 64.16% 3.434ms 64.16% 3.434ms 3.434ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.05% 108.443us 34.64% 1.832ms 1.832ms 0.000us 0.00% 4.810ms 4.810ms 1 + _flash_attn_9e27194::fwd 0.96% 50.812us 32.59% 1.723ms 574.364us 3.602ms 100.00% 4.810ms 1.603ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.603ms 100.04% 3.603ms 3.603ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.602ms 100.00% 3.602ms 1.201ms 3 + Activity Buffer Request 27.53% 1.455ms 27.53% 1.455ms 1.455ms 1.209ms 33.55% 1.209ms 1.209ms 1 + cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.14% 7.390us 0.45% 23.900us 7.967us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.31% 16.510us 0.31% 16.510us 5.503us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.40% 21.151us 0.40% 21.151us 2.350us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.10% 164.023us 3.10% 164.023us 54.674us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 65.36% 3.455ms 65.36% 3.455ms 3.455ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.351ms -Self CUDA time total: 3.588ms +Self CPU time total: 5.287ms +Self CUDA time total: 3.602ms @@ -4046,41 +4046,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.08% 111.044us 35.25% 1.879ms 1.879ms 0.000us 0.00% 4.816ms 4.816ms 1 - _flash_attn_9e27194::fwd 0.99% 52.834us 33.17% 1.768ms 589.427us 3.606ms 100.00% 4.816ms 1.605ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.607ms 100.05% 3.607ms 3.607ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.606ms 100.00% 3.606ms 1.202ms 3 - Activity Buffer Request 26.56% 1.416ms 26.56% 1.416ms 1.416ms 1.210ms 33.55% 1.210ms 1.210ms 1 - cudaDeviceGetAttribute 0.08% 4.460us 0.08% 4.460us 0.297us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.14% 7.500us 0.49% 26.051us 8.684us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.35% 18.551us 0.35% 18.551us 6.184us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.41% 21.960us 0.41% 21.960us 2.440us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 4.009us 0.08% 4.009us 1.336us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.55% 242.792us 4.55% 242.792us 80.931us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 64.75% 3.452ms 64.75% 3.452ms 3.452ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 1.95% 105.103us 34.11% 1.836ms 1.836ms 0.000us 0.00% 4.931ms 4.931ms 1 + _flash_attn_9e27194::fwd 1.08% 58.141us 32.16% 1.731ms 577.087us 3.693ms 100.00% 4.931ms 1.644ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.695ms 100.04% 3.695ms 3.695ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.693ms 100.00% 3.693ms 1.231ms 3 + Activity Buffer Request 26.71% 1.438ms 26.71% 1.438ms 1.438ms 1.238ms 33.53% 1.238ms 1.238ms 1 + cudaDeviceGetAttribute 0.08% 4.380us 0.08% 4.380us 0.292us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 8.230us 0.50% 26.750us 8.917us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.34% 18.520us 0.34% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.48% 25.961us 0.48% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 4.220us 0.08% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.23% 173.714us 3.23% 173.714us 57.905us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 65.89% 3.548ms 65.89% 3.548ms 3.548ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.332ms -Self CUDA time total: 3.606ms +Self CPU time total: 5.384ms +Self CUDA time total: 3.693ms impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True -hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True +hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
-
-
▶ UV Install Logs
- -
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s] -Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.26it/s] -Fetching 20 files: 10%|█ | 2/20 [00:01<00:17, 1.03it/s] -Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.64it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html index 3b19dae40ca718ad81f3050d7a0de99c655bf943..889bda3eb9ecfa28e1bd79f67d85d1acc88d58a0 100644 --- a/flash_attn/impls/hf_kernels_flash_attn3.html +++ b/flash_attn/impls/hf_kernels_flash_attn3.html @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 5.68s +Cell: benchmark | 5.52s | Raw @@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 3.89% 167.076us 44.49% 1.911ms 1.911ms 0.000us 0.00% 3.576ms 3.576ms 1 - FlashAttnFunc 3.00% 128.934us 40.60% 1.744ms 581.290us 0.000us 0.00% 3.576ms 1.192ms 3 - _flash_attn3_48fe103_dirty::fwd 1.82% 78.184us 37.60% 1.615ms 538.312us 2.688ms 100.00% 3.576ms 1.192ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.690ms 100.05% 2.690ms 2.690ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.688ms 100.00% 2.688ms 896.117us 3 - Activity Buffer Request 33.29% 1.430ms 33.29% 1.430ms 1.430ms 887.327us 33.01% 887.327us 887.327us 1 - aten::empty 1.08% 46.281us 1.08% 46.281us 7.714us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.37% 15.900us 0.37% 15.900us 5.300us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.04% 44.671us 1.04% 44.671us 14.890us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 55.51% 2.384ms 55.51% 2.384ms 2.384ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 3.72% 161.222us 44.67% 1.935ms 1.935ms 0.000us 0.00% 3.599ms 3.599ms 1 + FlashAttnFunc 2.81% 121.834us 40.95% 1.774ms 591.218us 0.000us 0.00% 3.599ms 1.200ms 3 + _flash_attn3_48fe103_dirty::fwd 1.85% 79.992us 38.14% 1.652ms 550.607us 2.693ms 100.00% 3.599ms 1.200ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.695ms 100.05% 2.695ms 2.695ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.693ms 100.00% 2.693ms 897.759us 3 + Activity Buffer Request 33.93% 1.470ms 33.93% 1.470ms 1.470ms 905.439us 33.62% 905.439us 905.439us 1 + aten::empty 1.00% 43.311us 1.00% 43.311us 7.219us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.32% 13.891us 0.32% 13.891us 4.630us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.04% 45.121us 1.04% 45.121us 15.040us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 55.33% 2.396ms 55.33% 2.396ms 2.396ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.295ms -Self CUDA time total: 2.688ms +Self CPU time total: 4.331ms +Self CUDA time total: 2.693ms @@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 3.06% 130.754us 41.10% 1.758ms 1.758ms 0.000us 0.00% 3.668ms 3.668ms 1 - FlashAttnFunc 2.23% 95.572us 38.05% 1.627ms 542.455us 0.000us 0.00% 3.668ms 1.223ms 3 - _flash_attn3_48fe103_dirty::fwd 1.23% 52.754us 35.81% 1.532ms 510.598us 2.747ms 100.00% 3.668ms 1.223ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.05% 2.748ms 2.748ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.747ms 100.00% 2.747ms 915.501us 3 - Activity Buffer Request 33.10% 1.416ms 33.10% 1.416ms 1.416ms 921.272us 33.54% 921.272us 921.272us 1 - aten::empty 0.63% 26.890us 0.63% 26.890us 4.482us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 4.970us 0.12% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.73% 31.351us 0.73% 31.351us 10.450us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.90% 2.519ms 58.90% 2.519ms 2.519ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.17% 96.772us 39.76% 1.770ms 1.770ms 0.000us 0.00% 3.876ms 3.876ms 1 + FlashAttnFunc 2.04% 90.694us 37.59% 1.674ms 557.834us 0.000us 0.00% 3.876ms 1.292ms 3 + _flash_attn3_48fe103_dirty::fwd 1.15% 51.142us 35.55% 1.583ms 527.603us 2.896ms 100.00% 3.876ms 1.292ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.898ms 100.05% 2.898ms 2.898ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.896ms 100.00% 2.896ms 965.387us 3 + Activity Buffer Request 33.04% 1.471ms 33.04% 1.471ms 1.471ms 979.809us 33.83% 979.809us 979.809us 1 + aten::empty 0.58% 25.610us 0.58% 25.610us 4.268us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.240us 0.12% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.67% 29.750us 0.67% 29.750us 9.917us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.24% 2.682ms 60.24% 2.682ms 2.682ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.277ms -Self CUDA time total: 2.747ms +Self CPU time total: 4.452ms +Self CUDA time total: 2.896ms @@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.33% 101.653us 39.53% 1.727ms 1.727ms 0.000us 0.00% 3.829ms 3.829ms 1 - FlashAttnFunc 2.05% 89.593us 37.20% 1.625ms 541.619us 0.000us 0.00% 3.829ms 1.276ms 3 - _flash_attn3_48fe103_dirty::fwd 1.17% 51.051us 35.15% 1.535ms 511.754us 2.856ms 100.00% 3.829ms 1.276ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.06% 2.858ms 2.858ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.856ms 100.00% 2.856ms 952.136us 3 - Activity Buffer Request 32.54% 1.421ms 32.54% 1.421ms 1.421ms 972.574us 34.05% 972.574us 972.574us 1 - aten::empty 0.62% 27.231us 0.62% 27.231us 4.538us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.411us 0.12% 5.411us 1.804us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.69% 30.341us 0.69% 30.341us 10.114us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 60.47% 2.642ms 60.47% 2.642ms 2.642ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.19% 98.331us 39.82% 1.786ms 1.786ms 0.000us 0.00% 3.885ms 3.885ms 1 + FlashAttnFunc 1.99% 89.333us 37.63% 1.688ms 562.551us 0.000us 0.00% 3.885ms 1.295ms 3 + _flash_attn3_48fe103_dirty::fwd 1.08% 48.311us 35.64% 1.598ms 532.773us 2.912ms 100.00% 3.885ms 1.295ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.05% 2.914ms 2.914ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912ms 100.00% 2.912ms 970.802us 3 + Activity Buffer Request 33.18% 1.488ms 33.18% 1.488ms 1.488ms 972.637us 33.40% 972.637us 972.637us 1 + aten::empty 0.57% 25.370us 0.57% 25.370us 4.228us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.13% 5.730us 0.13% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.69% 30.861us 0.69% 30.861us 10.287us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.18% 2.699ms 60.18% 2.699ms 2.699ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.368ms -Self CUDA time total: 2.856ms +Self CPU time total: 4.485ms +Self CUDA time total: 2.912ms @@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.61% 122.474us 42.62% 2.001ms 2.001ms 0.000us 0.00% 3.906ms 3.906ms 1 - FlashAttnFunc 1.99% 93.683us 40.01% 1.879ms 626.332us 0.000us 0.00% 3.906ms 1.302ms 3 - _flash_attn3_48fe103_dirty::fwd 1.17% 54.872us 38.02% 1.785ms 595.104us 2.915ms 100.00% 3.906ms 1.302ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.917ms 100.05% 2.917ms 2.917ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.00% 2.915ms 971.727us 3 - Activity Buffer Request 31.11% 1.461ms 31.11% 1.461ms 1.461ms 991.129us 34.00% 991.129us 991.129us 1 - aten::empty 0.59% 27.622us 0.59% 27.622us 4.604us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.820us 0.12% 5.820us 1.940us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 5.03% 236.178us 5.03% 236.178us 78.726us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 57.38% 2.695ms 57.38% 2.695ms 2.695ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.51% 118.553us 41.81% 1.973ms 1.973ms 0.000us 0.00% 3.964ms 3.964ms 1 + FlashAttnFunc 1.94% 91.662us 39.30% 1.855ms 618.205us 0.000us 0.00% 3.964ms 1.321ms 3 + _flash_attn3_48fe103_dirty::fwd 1.07% 50.373us 37.36% 1.763ms 587.651us 2.962ms 100.00% 3.964ms 1.321ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.05% 2.964ms 2.964ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.962ms 100.00% 2.962ms 987.401us 3 + Activity Buffer Request 30.92% 1.459ms 30.92% 1.459ms 1.459ms 1.002ms 33.82% 1.002ms 1.002ms 1 + aten::empty 0.56% 26.451us 0.56% 26.451us 4.408us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.70% 221.845us 4.70% 221.845us 73.948us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.19% 2.746ms 58.19% 2.746ms 2.746ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.696ms -Self CUDA time total: 2.915ms +Self CPU time total: 4.719ms +Self CUDA time total: 2.962ms @@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.45% 124.235us 37.18% 1.882ms 1.882ms 0.000us 0.00% 4.537ms 4.537ms 1 - FlashAttnFunc 1.83% 92.522us 34.73% 1.758ms 585.897us 0.000us 0.00% 4.537ms 1.512ms 3 - _flash_attn3_48fe103_dirty::fwd 1.03% 52.313us 32.90% 1.665ms 555.056us 3.398ms 100.00% 4.537ms 1.512ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.399ms 100.05% 3.399ms 3.399ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3 - Activity Buffer Request 27.82% 1.408ms 27.82% 1.408ms 1.408ms 1.139ms 33.52% 1.139ms 1.139ms 1 - aten::empty 0.54% 27.441us 0.54% 27.441us 4.573us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.839us 0.12% 5.839us 1.946us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.39% 171.646us 3.39% 171.646us 57.215us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 62.82% 3.179ms 62.82% 3.179ms 3.179ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.19% 114.453us 37.34% 1.953ms 1.953ms 0.000us 0.00% 4.662ms 4.662ms 1 + FlashAttnFunc 1.73% 90.401us 35.15% 1.838ms 612.822us 0.000us 0.00% 4.662ms 1.554ms 3 + _flash_attn3_48fe103_dirty::fwd 0.97% 50.643us 33.42% 1.748ms 582.688us 3.490ms 100.00% 4.662ms 1.554ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.492ms 100.04% 3.492ms 3.492ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.490ms 100.00% 3.490ms 1.163ms 3 + Activity Buffer Request 28.44% 1.487ms 28.44% 1.487ms 1.487ms 1.171ms 33.56% 1.171ms 1.171ms 1 + aten::empty 0.52% 27.271us 0.52% 27.271us 4.545us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.09% 4.950us 0.09% 4.950us 1.650us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.40% 178.024us 3.40% 178.024us 59.341us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.66% 3.277ms 62.66% 3.277ms 3.277ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.061ms -Self CUDA time total: 3.398ms +Self CPU time total: 5.230ms +Self CUDA time total: 3.490ms @@ -4035,33 +4035,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.74% 138.223us 36.95% 1.864ms 1.864ms 0.000us 0.00% 4.557ms 4.557ms 1 - FlashAttnFunc 1.84% 92.725us 34.21% 1.726ms 575.197us 0.000us 0.00% 4.557ms 1.519ms 3 - _flash_attn3_48fe103_dirty::fwd 1.03% 52.171us 32.37% 1.633ms 544.289us 3.424ms 100.00% 4.557ms 1.519ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.425ms 100.04% 3.425ms 3.425ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.424ms 100.00% 3.424ms 1.141ms 3 - Activity Buffer Request 27.34% 1.379ms 27.34% 1.379ms 1.379ms 1.133ms 33.10% 1.133ms 1.133ms 1 - aten::empty 0.57% 28.661us 0.57% 28.661us 4.777us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.10% 5.240us 0.10% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.33% 167.776us 3.33% 167.776us 55.925us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 63.05% 3.181ms 63.05% 3.181ms 3.181ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.26% 115.663us 36.27% 1.854ms 1.854ms 0.000us 0.00% 4.679ms 4.679ms 1 + FlashAttnFunc 2.25% 114.773us 34.01% 1.738ms 579.364us 0.000us 0.00% 4.679ms 1.560ms 3 + _flash_attn3_48fe103_dirty::fwd 1.02% 51.933us 31.76% 1.623ms 541.107us 3.499ms 100.00% 4.679ms 1.560ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 100.04% 3.500ms 3.500ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.499ms 100.00% 3.499ms 1.166ms 3 + Activity Buffer Request 26.80% 1.370ms 26.80% 1.370ms 1.370ms 1.181ms 33.75% 1.181ms 1.181ms 1 + aten::empty 0.54% 27.681us 0.54% 27.681us 4.613us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.079us 0.10% 5.079us 1.693us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.30% 168.813us 3.30% 168.813us 56.271us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.73% 3.257ms 63.73% 3.257ms 3.257ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.045ms -Self CUDA time total: 3.424ms +Self CPU time total: 5.111ms +Self CUDA time total: 3.499ms impl wl p50(ms) ok -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True -hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True -hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True +hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True +hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.27it/s] -Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.55it/s] +Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.38it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.75it/s]

Artifacts:

diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html index 9e30082f387ab5511025b216cc2dd03e743dccac..f6ab4e24cf377304db7fbbedb7a4571918177b17 100644 --- a/flash_attn/impls/mem_efficient_attention.html +++ b/flash_attn/impls/mem_efficient_attention.html @@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 32.68s +Cell: benchmark | 3.92s | Raw @@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 4.77% 340.490us 32.91% 2.350ms 2.350ms 0.000us 0.00% 5.530ms 5.530ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.523ms 100.81% 5.523ms 5.523ms 1 - aten::scaled_dot_product_attention 0.44% 31.421us 2.67% 190.938us 63.646us 0.000us 0.00% 4.861ms 1.620ms 3 - aten::_scaled_dot_product_efficient_attention 0.35% 24.771us 2.23% 159.517us 53.172us 0.000us 0.00% 4.861ms 1.620ms 3 - aten::_efficient_attention_forward 0.51% 36.163us 1.50% 107.413us 35.804us 4.861ms 88.73% 4.861ms 1.620ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.861ms 88.73% 4.861ms 1.620ms 3 - aten::contiguous 0.17% 12.232us 24.52% 1.751ms 194.525us 0.000us 0.00% 668.128us 74.236us 9 - aten::clone 0.48% 34.579us 24.35% 1.738ms 193.165us 0.000us 0.00% 668.128us 74.236us 9 - aten::copy_ 1.16% 82.494us 22.79% 1.628ms 180.845us 617.312us 11.27% 668.128us 74.236us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.312us 11.27% 617.312us 68.590us 9 - Activity Buffer Request 20.35% 1.453ms 20.35% 1.453ms 1.453ms 50.816us 0.93% 50.816us 50.816us 1 - aten::transpose 1.00% 71.754us 1.33% 95.065us 3.961us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.33% 23.311us 0.33% 23.311us 0.971us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.27% 19.481us 1.07% 76.301us 8.478us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 1.26% 89.759us 1.26% 89.759us 4.274us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.62% 115.656us 1.62% 115.656us 9.638us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.04% 2.980us 0.04% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.16% 11.490us 0.16% 11.490us 3.830us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 67.09% 4.790ms 67.09% 4.790ms 4.790ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 4.77% 333.269us 32.71% 2.284ms 2.284ms 0.000us 0.00% 5.420ms 5.420ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.402ms 100.61% 5.402ms 5.402ms 1 + aten::scaled_dot_product_attention 0.44% 30.450us 2.54% 177.435us 59.145us 0.000us 0.00% 4.753ms 1.584ms 3 + aten::_scaled_dot_product_efficient_attention 0.33% 22.722us 2.10% 146.985us 48.995us 0.000us 0.00% 4.753ms 1.584ms 3 + aten::_efficient_attention_forward 0.51% 35.382us 1.42% 99.273us 33.091us 4.753ms 88.51% 4.753ms 1.584ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.753ms 88.51% 4.753ms 1.584ms 3 + aten::contiguous 0.17% 11.660us 24.51% 1.712ms 190.185us 0.000us 0.00% 667.266us 74.141us 9 + aten::clone 0.46% 31.810us 24.34% 1.700ms 188.889us 0.000us 0.00% 667.266us 74.141us 9 + aten::copy_ 1.01% 70.871us 22.86% 1.597ms 177.404us 616.738us 11.49% 667.266us 74.141us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.738us 11.49% 616.738us 68.526us 9 + Activity Buffer Request 20.64% 1.441ms 20.64% 1.441ms 1.441ms 50.528us 0.94% 50.528us 50.528us 1 + aten::transpose 0.91% 63.619us 1.25% 87.011us 3.625us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.33% 23.392us 0.33% 23.392us 0.975us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.24% 16.972us 1.02% 71.553us 7.950us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 1.18% 82.691us 1.18% 82.691us 3.938us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.55% 108.383us 1.55% 108.383us 9.032us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.05% 3.260us 0.05% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.12% 8.450us 0.12% 8.450us 2.817us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 67.29% 4.700ms 67.29% 4.700ms 4.700ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.140ms -Self CUDA time total: 5.479ms +Self CPU time total: 6.984ms +Self CUDA time total: 5.369ms @@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.38% 251.986us 27.98% 2.086ms 2.086ms 0.000us 0.00% 6.014ms 6.014ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.969ms 100.15% 5.969ms 5.969ms 1 - aten::scaled_dot_product_attention 0.27% 19.962us 1.97% 146.646us 48.882us 0.000us 0.00% 5.323ms 1.774ms 3 - aten::_scaled_dot_product_efficient_attention 0.26% 19.141us 1.70% 126.684us 42.228us 0.000us 0.00% 5.323ms 1.774ms 3 - aten::_efficient_attention_forward 0.39% 29.281us 1.12% 83.514us 27.838us 5.323ms 89.32% 5.323ms 1.774ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.323ms 89.32% 5.323ms 1.774ms 3 - aten::contiguous 0.10% 7.510us 22.05% 1.644ms 182.655us 0.000us 0.00% 690.909us 76.768us 9 - aten::clone 0.31% 23.251us 21.95% 1.636ms 181.821us 0.000us 0.00% 690.909us 76.768us 9 - aten::copy_ 0.91% 68.131us 20.95% 1.562ms 173.540us 636.478us 10.68% 690.909us 76.768us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.478us 10.68% 636.478us 70.720us 9 - Activity Buffer Request 19.09% 1.423ms 19.09% 1.423ms 1.423ms 54.431us 0.91% 54.431us 54.431us 1 - aten::transpose 0.68% 50.542us 0.90% 67.292us 2.804us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.22% 16.750us 0.22% 16.750us 0.698us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.17% 12.371us 0.69% 51.272us 5.697us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.87% 64.771us 0.87% 64.771us 3.084us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.25% 93.466us 1.25% 93.466us 7.789us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.400us 0.03% 2.400us 0.800us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.05% 3.371us 0.05% 3.371us 1.124us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 72.02% 5.368ms 72.02% 5.368ms 5.368ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.53% 251.015us 29.52% 2.098ms 2.098ms 0.000us 0.00% 5.633ms 5.633ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.587ms 100.15% 5.587ms 5.587ms 1 + aten::scaled_dot_product_attention 0.25% 17.630us 2.05% 145.594us 48.531us 0.000us 0.00% 4.943ms 1.648ms 3 + aten::_scaled_dot_product_efficient_attention 0.28% 19.810us 1.80% 127.964us 42.655us 0.000us 0.00% 4.943ms 1.648ms 3 + aten::_efficient_attention_forward 0.42% 29.862us 1.18% 83.512us 27.837us 4.943ms 88.61% 4.943ms 1.648ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.943ms 88.61% 4.943ms 1.648ms 3 + aten::contiguous 0.10% 7.191us 23.30% 1.656ms 184.002us 0.000us 0.00% 689.540us 76.616us 9 + aten::clone 0.33% 23.318us 23.20% 1.649ms 183.203us 0.000us 0.00% 689.540us 76.616us 9 + aten::copy_ 0.92% 65.725us 22.12% 1.572ms 174.717us 635.140us 11.39% 689.540us 76.616us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.140us 11.39% 635.140us 70.571us 9 + Activity Buffer Request 20.24% 1.439ms 20.24% 1.439ms 1.439ms 54.400us 0.98% 54.400us 54.400us 1 + aten::transpose 0.71% 50.494us 0.99% 70.123us 2.922us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.28% 19.629us 0.28% 19.629us 0.818us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.18% 12.608us 0.75% 53.061us 5.896us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.94% 66.903us 0.94% 66.903us 3.186us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.25% 89.012us 1.25% 89.012us 7.418us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.220us 0.03% 2.220us 0.740us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.05% 3.880us 0.05% 3.880us 1.293us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 70.48% 5.009ms 70.48% 5.009ms 5.009ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.454ms -Self CUDA time total: 5.959ms +Self CPU time total: 7.107ms +Self CUDA time total: 5.578ms @@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.08% 235.490us 27.25% 2.083ms 2.083ms 0.000us 0.00% 6.182ms 6.182ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.132ms 100.15% 6.132ms 6.132ms 1 - aten::scaled_dot_product_attention 0.24% 18.220us 1.86% 142.046us 47.349us 0.000us 0.00% 5.466ms 1.822ms 3 - aten::_scaled_dot_product_efficient_attention 0.24% 18.131us 1.62% 123.826us 41.275us 0.000us 0.00% 5.466ms 1.822ms 3 - aten::_efficient_attention_forward 0.37% 27.940us 1.08% 82.291us 27.430us 5.466ms 89.28% 5.466ms 1.822ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.466ms 89.28% 5.466ms 1.822ms 3 - aten::contiguous 0.10% 7.272us 21.47% 1.642ms 182.409us 0.000us 0.00% 715.197us 79.466us 9 - aten::clone 0.29% 22.290us 21.38% 1.634ms 181.601us 0.000us 0.00% 715.197us 79.466us 9 - aten::copy_ 0.83% 63.251us 20.39% 1.559ms 173.182us 656.318us 10.72% 715.197us 79.466us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.318us 10.72% 656.318us 72.924us 9 - Activity Buffer Request 18.70% 1.430ms 18.70% 1.430ms 1.430ms 58.879us 0.96% 58.879us 58.879us 1 - aten::transpose 0.93% 71.209us 1.15% 87.625us 3.651us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.21% 16.416us 0.21% 16.416us 0.684us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 11.741us 0.70% 53.481us 5.942us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.89% 67.840us 0.89% 67.840us 3.230us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.15% 88.022us 1.15% 88.022us 7.335us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.651us 0.03% 2.651us 0.884us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.370us 0.04% 3.370us 1.123us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 72.75% 5.562ms 72.75% 5.562ms 5.562ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.28% 246.598us 28.54% 2.146ms 2.146ms 0.000us 0.00% 6.014ms 6.014ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.967ms 100.18% 5.967ms 5.967ms 1 + aten::scaled_dot_product_attention 0.24% 18.181us 1.92% 144.583us 48.194us 0.000us 0.00% 5.302ms 1.767ms 3 + aten::_scaled_dot_product_efficient_attention 0.27% 19.980us 1.68% 126.402us 42.134us 0.000us 0.00% 5.302ms 1.767ms 3 + aten::_efficient_attention_forward 0.38% 28.571us 1.10% 82.521us 27.507us 5.302ms 89.01% 5.302ms 1.767ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.302ms 89.01% 5.302ms 1.767ms 3 + aten::contiguous 0.09% 6.930us 22.70% 1.707ms 189.666us 0.000us 0.00% 712.547us 79.172us 9 + aten::clone 0.30% 22.691us 22.61% 1.700ms 188.896us 0.000us 0.00% 712.547us 79.172us 9 + aten::copy_ 1.08% 81.024us 21.57% 1.622ms 180.228us 654.403us 10.99% 712.547us 79.172us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.403us 10.99% 654.403us 72.711us 9 + Activity Buffer Request 19.57% 1.471ms 19.57% 1.471ms 1.471ms 58.144us 0.98% 58.144us 58.144us 1 + aten::transpose 0.68% 51.431us 0.95% 71.351us 2.973us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.26% 19.920us 0.26% 19.920us 0.830us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.16% 11.979us 0.74% 55.320us 6.147us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.93% 69.561us 0.93% 69.561us 3.312us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.22% 91.652us 1.22% 91.652us 7.638us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.359us 0.03% 2.359us 0.786us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.05% 3.430us 0.05% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.46% 5.373ms 71.46% 5.373ms 5.373ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.646ms -Self CUDA time total: 6.123ms +Self CPU time total: 7.519ms +Self CUDA time total: 5.956ms @@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 2.84% 224.838us 29.78% 2.354ms 2.354ms 0.000us 0.00% 6.170ms 6.170ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.121ms 100.15% 6.121ms 6.121ms 1 - aten::scaled_dot_product_attention 0.24% 18.891us 1.82% 143.646us 47.882us 0.000us 0.00% 5.458ms 1.819ms 3 - aten::_scaled_dot_product_efficient_attention 0.24% 19.093us 1.58% 124.755us 41.585us 0.000us 0.00% 5.458ms 1.819ms 3 - aten::_efficient_attention_forward 0.36% 28.140us 1.04% 82.213us 27.404us 5.458ms 89.30% 5.458ms 1.819ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.458ms 89.30% 5.458ms 1.819ms 3 - aten::contiguous 0.10% 7.739us 24.57% 1.942ms 215.806us 0.000us 0.00% 711.998us 79.111us 9 - aten::clone 0.31% 24.450us 24.47% 1.935ms 214.946us 0.000us 0.00% 711.998us 79.111us 9 - aten::copy_ 0.86% 68.064us 23.51% 1.859ms 206.523us 653.982us 10.70% 711.998us 79.111us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.982us 10.70% 653.982us 72.665us 9 - Activity Buffer Request 18.84% 1.489ms 18.84% 1.489ms 1.489ms 58.016us 0.95% 58.016us 58.016us 1 - aten::transpose 0.62% 49.288us 0.84% 66.489us 2.770us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.22% 17.201us 0.22% 17.201us 0.717us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 12.041us 0.65% 51.362us 5.707us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.83% 65.351us 0.83% 65.351us 3.112us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 4.09% 323.234us 4.09% 323.234us 26.936us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.670us 0.03% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.430us 0.04% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 70.22% 5.551ms 70.22% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.21% 251.576us 29.97% 2.347ms 2.347ms 0.000us 0.00% 6.116ms 6.116ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.068ms 100.14% 6.068ms 6.068ms 1 + aten::scaled_dot_product_attention 0.24% 18.800us 1.87% 146.693us 48.898us 0.000us 0.00% 5.408ms 1.803ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 19.900us 1.63% 127.893us 42.631us 0.000us 0.00% 5.408ms 1.803ms 3 + aten::_efficient_attention_forward 0.38% 29.372us 1.07% 83.903us 27.968us 5.408ms 89.25% 5.408ms 1.803ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.408ms 89.25% 5.408ms 1.803ms 3 + aten::contiguous 0.10% 7.511us 24.29% 1.902ms 211.340us 0.000us 0.00% 708.735us 78.748us 9 + aten::clone 0.28% 21.872us 24.19% 1.895ms 210.505us 0.000us 0.00% 708.735us 78.748us 9 + aten::copy_ 0.85% 66.540us 23.20% 1.817ms 201.834us 651.551us 10.75% 708.735us 78.748us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 651.551us 10.75% 651.551us 72.395us 9 + Activity Buffer Request 18.68% 1.462ms 18.68% 1.462ms 1.462ms 57.184us 0.94% 57.184us 57.184us 1 + aten::transpose 0.65% 50.781us 0.90% 70.402us 2.933us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.25% 19.621us 0.25% 19.621us 0.818us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 11.809us 0.72% 56.170us 6.241us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.90% 70.242us 0.90% 70.242us 3.345us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.97% 310.797us 3.97% 310.797us 25.900us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.220us 0.04% 3.220us 1.073us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 70.03% 5.484ms 70.03% 5.484ms 5.484ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.905ms -Self CUDA time total: 6.112ms +Self CPU time total: 7.830ms +Self CUDA time total: 6.059ms @@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 2.78% 220.799us 28.42% 2.258ms 2.258ms 0.000us 0.00% 6.296ms 6.296ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.245ms 100.15% 6.245ms 6.245ms 1 - aten::scaled_dot_product_attention 0.24% 19.311us 1.79% 142.116us 47.372us 0.000us 0.00% 5.574ms 1.858ms 3 - aten::_scaled_dot_product_efficient_attention 0.23% 17.909us 1.55% 122.805us 40.935us 0.000us 0.00% 5.574ms 1.858ms 3 - aten::_efficient_attention_forward 0.36% 28.682us 1.03% 82.073us 27.358us 5.574ms 89.39% 5.574ms 1.858ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.574ms 89.39% 5.574ms 1.858ms 3 - aten::contiguous 0.09% 7.009us 23.32% 1.852ms 205.811us 0.000us 0.00% 721.599us 80.178us 9 - aten::clone 0.28% 22.450us 23.23% 1.845ms 205.033us 0.000us 0.00% 721.599us 80.178us 9 - aten::copy_ 0.87% 68.713us 22.33% 1.774ms 197.096us 661.695us 10.61% 721.599us 80.178us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 661.695us 10.61% 661.695us 73.522us 9 - Activity Buffer Request 17.91% 1.422ms 17.91% 1.422ms 1.422ms 59.904us 0.96% 59.904us 59.904us 1 - aten::transpose 0.61% 48.435us 0.82% 65.304us 2.721us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.21% 16.869us 0.21% 16.869us 0.703us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.14% 11.511us 0.62% 48.982us 5.442us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.78% 61.691us 0.78% 61.691us 2.938us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 3.85% 305.580us 3.85% 305.580us 25.465us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.440us 0.03% 2.440us 0.813us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.05% 3.920us 0.05% 3.920us 1.307us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 71.58% 5.685ms 71.58% 5.685ms 5.685ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.15% 250.575us 28.50% 2.270ms 2.270ms 0.000us 0.00% 6.322ms 6.322ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.270ms 100.14% 6.270ms 6.270ms 1 + aten::scaled_dot_product_attention 0.22% 17.572us 1.82% 145.084us 48.361us 0.000us 0.00% 5.598ms 1.866ms 3 + aten::_scaled_dot_product_efficient_attention 0.24% 19.250us 1.60% 127.512us 42.504us 0.000us 0.00% 5.598ms 1.866ms 3 + aten::_efficient_attention_forward 0.36% 28.812us 1.05% 83.962us 27.987us 5.598ms 89.40% 5.598ms 1.866ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.598ms 89.40% 5.598ms 1.866ms 3 + aten::contiguous 0.09% 6.912us 22.94% 1.827ms 203.045us 0.000us 0.00% 724.000us 80.444us 9 + aten::clone 0.28% 21.949us 22.86% 1.820ms 202.277us 0.000us 0.00% 724.000us 80.444us 9 + aten::copy_ 0.82% 65.091us 21.89% 1.744ms 193.745us 664.032us 10.60% 724.000us 80.444us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.032us 10.60% 664.032us 73.781us 9 + Activity Buffer Request 18.02% 1.435ms 18.02% 1.435ms 1.435ms 59.968us 0.96% 59.968us 59.968us 1 + aten::transpose 0.64% 50.930us 0.89% 70.859us 2.952us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.25% 19.929us 0.25% 19.929us 0.830us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 12.022us 0.69% 54.843us 6.094us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.87% 69.430us 0.87% 69.430us 3.306us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.34% 266.388us 3.34% 266.388us 22.199us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.320us 0.03% 2.320us 0.773us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.120us 0.04% 3.120us 1.040us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.50% 5.695ms 71.50% 5.695ms 5.695ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.943ms -Self CUDA time total: 6.236ms +Self CPU time total: 7.965ms +Self CUDA time total: 6.262ms @@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.27% 267.711us 29.30% 2.401ms 2.401ms 0.000us 0.00% 6.459ms 6.459ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.406ms 100.13% 6.406ms 6.406ms 1 - aten::scaled_dot_product_attention 0.24% 19.643us 1.85% 151.176us 50.392us 0.000us 0.00% 5.726ms 1.909ms 3 - aten::_scaled_dot_product_efficient_attention 0.26% 20.920us 1.61% 131.533us 43.844us 0.000us 0.00% 5.726ms 1.909ms 3 - aten::_efficient_attention_forward 0.37% 30.563us 1.03% 84.603us 28.201us 5.726ms 89.50% 5.726ms 1.909ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.50% 5.726ms 1.909ms 3 - aten::contiguous 0.09% 7.670us 23.58% 1.932ms 214.647us 0.000us 0.00% 733.247us 81.472us 9 - aten::clone 0.31% 25.042us 23.48% 1.924ms 213.795us 0.000us 0.00% 733.247us 81.472us 9 - aten::copy_ 0.88% 72.162us 22.52% 1.845ms 205.052us 671.711us 10.50% 733.247us 81.472us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 671.711us 10.50% 671.711us 74.635us 9 - Activity Buffer Request 17.78% 1.456ms 17.78% 1.456ms 1.456ms 61.536us 0.96% 61.536us 61.536us 1 - aten::transpose 0.71% 58.110us 0.93% 75.842us 3.160us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.22% 17.732us 0.22% 17.732us 0.739us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 12.319us 0.65% 53.641us 5.960us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.81% 66.513us 0.81% 66.513us 3.167us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 4.14% 339.159us 4.14% 339.159us 28.263us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.379us 0.03% 2.379us 0.793us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.05% 4.230us 0.05% 4.230us 1.410us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 70.70% 5.793ms 70.70% 5.793ms 5.793ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.00% 248.403us 26.98% 2.232ms 2.232ms 0.000us 0.00% 6.668ms 6.668ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.616ms 100.13% 6.616ms 6.616ms 1 + aten::scaled_dot_product_attention 0.21% 17.221us 1.72% 142.654us 47.551us 0.000us 0.00% 5.939ms 1.980ms 3 + aten::_scaled_dot_product_efficient_attention 0.23% 18.779us 1.52% 125.433us 41.811us 0.000us 0.00% 5.939ms 1.980ms 3 + aten::_efficient_attention_forward 0.34% 28.440us 0.99% 81.712us 27.237us 5.939ms 89.88% 5.939ms 1.980ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 89.88% 5.939ms 1.980ms 3 + aten::contiguous 0.08% 6.861us 21.66% 1.792ms 199.142us 0.000us 0.00% 729.440us 81.049us 9 + aten::clone 0.26% 21.352us 21.58% 1.785ms 198.379us 0.000us 0.00% 729.440us 81.049us 9 + aten::copy_ 0.83% 69.012us 20.65% 1.709ms 189.858us 668.928us 10.12% 729.440us 81.049us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.928us 10.12% 668.928us 74.325us 9 + Activity Buffer Request 17.29% 1.430ms 17.29% 1.430ms 1.430ms 60.512us 0.92% 60.512us 60.512us 1 + aten::transpose 0.63% 51.780us 0.89% 73.784us 3.074us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.27% 22.004us 0.27% 22.004us 0.917us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 11.870us 0.67% 55.340us 6.149us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.84% 69.312us 0.84% 69.312us 3.301us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 2.79% 231.145us 2.79% 231.145us 19.262us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.280us 0.03% 2.280us 0.760us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.570us 0.04% 3.570us 1.190us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.02% 6.041ms 73.02% 6.041ms 6.041ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 8.193ms -Self CUDA time total: 6.398ms +Self CPU time total: 8.273ms +Self CUDA time total: 6.608ms impl wl p50(ms) ok -torch_mem_eff cuda_attn_L128_bfloat16 1.86 True -torch_mem_eff cuda_attn_L256_bfloat16 1.97 True -torch_mem_eff cuda_attn_L320_bfloat16 2.04 True -torch_mem_eff cuda_attn_L384_bfloat16 2.06 True -torch_mem_eff cuda_attn_L448_bfloat16 2.03 True +torch_mem_eff cuda_attn_L128_bfloat16 1.83 True +torch_mem_eff cuda_attn_L256_bfloat16 1.89 True +torch_mem_eff cuda_attn_L320_bfloat16 2.00 True +torch_mem_eff cuda_attn_L384_bfloat16 1.97 True +torch_mem_eff cuda_attn_L448_bfloat16 2.06 True torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
-
-
▶ UV Install Logs
- -

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html index 573b17feaed54b9320d6ff8e360dfa03da8f3be9..9d07a2ce157ec6414ddbe4c27bea52ef7ed253b0 100644 --- a/flash_attn/impls/sage_attention.html +++ b/flash_attn/impls/sage_attention.html @@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 4.22s +Cell: benchmark | 4.53s | Raw @@ -3920,23 +3920,28 @@ Cell: benchmark | 4.22s
Running attention benchmark on cuda with 6 workloads.
 impl                     wl                  p50(ms)  ok
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
 
-
-Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.92it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19.13it/s] +
+
▶ UV Install Logs
+ +
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] +Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 15.79it/s] +Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.55it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 18.83it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html index 2dadff0b53907b1426c870df5e01dac812507a43..6363e024de1afb10cb31713f99cf844d998ebe90 100644 --- a/flash_attn/impls/xformers.html +++ b/flash_attn/impls/xformers.html @@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 9.93% 451.937us 49.71% 2.262ms 2.262ms 0.000us 0.00% 3.695ms 3.695ms 1 - xformers_flash3::flash_fwd 4.26% 193.656us 38.96% 1.773ms 590.904us 0.000us 0.00% 3.695ms 1.232ms 3 - flash_attn_3::fwd 1.62% 73.841us 34.71% 1.579ms 526.352us 2.795ms 100.00% 3.695ms 1.232ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.797ms 100.05% 2.797ms 2.797ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.773us 3 - Activity Buffer Request 31.17% 1.418ms 31.17% 1.418ms 1.418ms 899.421us 32.18% 899.421us 899.421us 1 - aten::empty 0.76% 34.741us 0.76% 34.741us 5.790us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.30% 13.732us 0.30% 13.732us 4.577us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.85% 38.662us 0.85% 38.662us 12.887us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.35% 15.860us 0.82% 37.181us 6.197us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.47% 21.321us 0.47% 21.321us 3.553us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 50.29% 2.288ms 50.29% 2.288ms 2.288ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 10.85% 481.112us 51.55% 2.285ms 2.285ms 0.000us 0.00% 3.582ms 3.582ms 1 + xformers_flash3::flash_fwd 4.56% 202.185us 39.85% 1.766ms 588.715us 0.000us 0.00% 3.582ms 1.194ms 3 + flash_attn_3::fwd 1.68% 74.662us 35.29% 1.564ms 521.320us 2.681ms 100.00% 3.582ms 1.194ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.682ms 100.06% 2.682ms 2.682ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.681ms 100.00% 2.681ms 893.515us 3 + Activity Buffer Request 31.74% 1.407ms 31.74% 1.407ms 1.407ms 901.761us 33.64% 901.761us 901.761us 1 + aten::empty 0.77% 33.920us 0.77% 33.920us 5.653us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.23% 10.152us 0.23% 10.152us 3.384us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.87% 38.521us 0.87% 38.521us 12.840us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.29% 13.028us 0.85% 37.710us 6.285us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.56% 24.682us 0.56% 24.682us 4.114us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 48.45% 2.147ms 48.45% 2.147ms 2.147ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.550ms -Self CUDA time total: 2.795ms +Self CPU time total: 4.432ms +Self CUDA time total: 2.681ms @@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.95% 312.321us 44.96% 2.021ms 2.021ms 0.000us 0.00% 3.832ms 3.832ms 1 - xformers_flash3::flash_fwd 3.14% 141.315us 37.51% 1.686ms 561.970us 0.000us 0.00% 3.832ms 1.277ms 3 - flash_attn_3::fwd 1.18% 53.030us 34.37% 1.545ms 514.865us 2.890ms 100.00% 3.832ms 1.277ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.05% 2.892ms 2.892ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.00% 2.890ms 963.329us 3 - Activity Buffer Request 31.64% 1.422ms 31.64% 1.422ms 1.422ms 942.465us 32.61% 942.465us 942.465us 1 - aten::empty 0.68% 30.660us 0.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.592us 0.12% 5.592us 1.864us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.74% 33.432us 0.74% 33.432us 11.144us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.20% 8.951us 0.50% 22.691us 3.782us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 55.04% 2.474ms 55.04% 2.474ms 2.474ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 7.16% 317.438us 45.96% 2.036ms 2.036ms 0.000us 0.00% 3.779ms 3.779ms 1 + xformers_flash3::flash_fwd 3.35% 148.243us 38.25% 1.695ms 564.991us 0.000us 0.00% 3.779ms 1.260ms 3 + flash_attn_3::fwd 1.25% 55.403us 34.91% 1.547ms 515.576us 2.825ms 100.00% 3.779ms 1.260ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.827ms 100.05% 2.827ms 2.827ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.825ms 100.00% 2.825ms 941.739us 3 + Activity Buffer Request 32.14% 1.424ms 32.14% 1.424ms 1.424ms 954.080us 33.77% 954.080us 954.080us 1 + aten::empty 0.63% 27.720us 0.63% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.400us 0.12% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.77% 34.161us 0.77% 34.161us 11.387us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.21% 9.370us 0.54% 23.750us 3.958us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.32% 14.380us 0.32% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 54.04% 2.395ms 54.04% 2.395ms 2.395ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.495ms -Self CUDA time total: 2.890ms +Self CPU time total: 4.431ms +Self CUDA time total: 2.825ms @@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.65% 298.008us 44.73% 2.006ms 2.006ms 0.000us 0.00% 3.867ms 3.867ms 1 - xformers_flash3::flash_fwd 3.15% 141.235us 37.58% 1.685ms 561.690us 0.000us 0.00% 3.867ms 1.289ms 3 - flash_attn_3::fwd 1.18% 53.120us 34.43% 1.544ms 514.611us 2.888ms 100.00% 3.867ms 1.289ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.683us 3 - Activity Buffer Request 31.72% 1.422ms 31.72% 1.422ms 1.422ms 978.939us 33.90% 978.939us 978.939us 1 - aten::empty 0.67% 30.192us 0.67% 30.192us 5.032us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.491us 0.12% 5.491us 1.830us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.73% 32.901us 0.73% 32.901us 10.967us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.20% 8.773us 0.50% 22.603us 3.767us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.31% 13.830us 0.31% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 55.27% 2.478ms 55.27% 2.478ms 2.478ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.87% 310.027us 44.72% 2.018ms 2.018ms 0.000us 0.00% 3.923ms 3.923ms 1 + xformers_flash3::flash_fwd 3.22% 145.444us 37.33% 1.684ms 561.324us 0.000us 0.00% 3.923ms 1.308ms 3 + flash_attn_3::fwd 1.15% 52.002us 34.10% 1.539ms 512.843us 2.919ms 100.00% 3.923ms 1.308ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.921ms 100.06% 2.921ms 2.921ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.919ms 100.00% 2.919ms 973.037us 3 + Activity Buffer Request 31.44% 1.418ms 31.44% 1.418ms 1.418ms 1.004ms 34.40% 1.004ms 1.004ms 1 + aten::empty 0.63% 28.392us 0.63% 28.392us 4.732us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.76% 34.420us 0.76% 34.420us 11.473us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.21% 9.519us 0.52% 23.650us 3.942us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.31% 14.131us 0.31% 14.131us 2.355us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 55.28% 2.494ms 55.28% 2.494ms 2.494ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.484ms -Self CUDA time total: 2.888ms +Self CPU time total: 4.511ms +Self CUDA time total: 2.919ms @@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.31% 299.042us 46.56% 2.205ms 2.205ms 0.000us 0.00% 3.936ms 3.936ms 1 - xformers_flash3::flash_fwd 2.97% 140.784us 39.75% 1.883ms 627.609us 0.000us 0.00% 3.936ms 1.312ms 3 - flash_attn_3::fwd 1.10% 52.191us 36.78% 1.742ms 580.681us 2.941ms 100.00% 3.936ms 1.312ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.941ms 100.00% 2.941ms 980.445us 3 - Activity Buffer Request 30.11% 1.426ms 30.11% 1.426ms 1.426ms 994.973us 33.83% 994.973us 994.973us 1 - aten::empty 0.64% 30.333us 0.64% 30.333us 5.055us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.440us 0.11% 5.440us 1.813us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.81% 227.898us 4.81% 227.898us 75.966us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.19% 8.769us 0.49% 23.220us 3.870us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.31% 14.451us 0.31% 14.451us 2.409us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 53.44% 2.531ms 53.44% 2.531ms 2.531ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.73% 317.798us 47.46% 2.241ms 2.241ms 0.000us 0.00% 3.892ms 3.892ms 1 + xformers_flash3::flash_fwd 3.10% 146.544us 40.23% 1.900ms 633.169us 0.000us 0.00% 3.892ms 1.297ms 3 + flash_attn_3::fwd 1.15% 54.462us 37.13% 1.753ms 584.321us 2.910ms 100.00% 3.892ms 1.297ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.05% 2.911ms 2.911ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.910ms 100.00% 2.910ms 969.848us 3 + Activity Buffer Request 30.01% 1.417ms 30.01% 1.417ms 1.417ms 982.915us 33.78% 982.915us 982.915us 1 + aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.370us 0.11% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.23% 247.156us 5.23% 247.156us 82.385us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.20% 9.560us 0.50% 23.460us 3.910us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.29% 13.900us 0.29% 13.900us 2.317us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 52.54% 2.481ms 52.54% 2.481ms 2.481ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.736ms -Self CUDA time total: 2.941ms +Self CPU time total: 4.721ms +Self CUDA time total: 2.910ms @@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 5.82% 299.962us 41.73% 2.152ms 2.152ms 0.000us 0.00% 4.566ms 4.566ms 1 - xformers_flash3::flash_fwd 2.76% 142.114us 35.47% 1.829ms 609.751us 0.000us 0.00% 4.566ms 1.522ms 3 - flash_attn_3::fwd 1.04% 53.631us 32.71% 1.687ms 562.380us 3.419ms 100.00% 4.566ms 1.522ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.420ms 100.05% 3.420ms 3.420ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3 - Activity Buffer Request 27.56% 1.422ms 27.56% 1.422ms 1.422ms 1.148ms 33.58% 1.148ms 1.148ms 1 - aten::empty 0.60% 31.172us 0.60% 31.172us 5.195us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.431us 0.11% 5.431us 1.810us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.40% 175.366us 3.40% 175.366us 58.455us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.17% 8.849us 0.45% 23.030us 3.838us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.27% 14.181us 0.27% 14.181us 2.363us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 58.27% 3.005ms 58.27% 3.005ms 3.005ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.86% 306.369us 41.94% 2.193ms 2.193ms 0.000us 0.00% 4.614ms 4.614ms 1 + xformers_flash3::flash_fwd 2.85% 149.202us 35.63% 1.863ms 620.885us 0.000us 0.00% 4.614ms 1.538ms 3 + flash_attn_3::fwd 1.03% 53.951us 32.77% 1.713ms 571.151us 3.461ms 100.00% 4.614ms 1.538ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.462ms 100.04% 3.462ms 3.462ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.461ms 100.00% 3.461ms 1.154ms 3 + Activity Buffer Request 27.28% 1.426ms 27.28% 1.426ms 1.426ms 1.153ms 33.31% 1.153ms 1.153ms 1 + aten::empty 0.55% 28.813us 0.55% 28.813us 4.802us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.560us 0.11% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.80% 198.684us 3.80% 198.684us 66.228us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 9.430us 0.46% 23.930us 3.988us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.28% 14.500us 0.28% 14.500us 2.417us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 58.06% 3.036ms 58.06% 3.036ms 3.036ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.157ms -Self CUDA time total: 3.419ms +Self CPU time total: 5.228ms +Self CUDA time total: 3.461ms @@ -4043,37 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 5.76% 295.800us 41.67% 2.139ms 2.139ms 0.000us 0.00% 4.557ms 4.557ms 1 - xformers_flash3::flash_fwd 2.75% 141.044us 35.47% 1.821ms 606.924us 0.000us 0.00% 4.557ms 1.519ms 3 - flash_attn_3::fwd 1.04% 53.523us 32.72% 1.680ms 559.910us 3.405ms 100.00% 4.557ms 1.519ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.407ms 100.05% 3.407ms 3.407ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.405ms 100.00% 3.405ms 1.135ms 3 - Activity Buffer Request 27.67% 1.420ms 27.67% 1.420ms 1.420ms 1.152ms 33.82% 1.152ms 1.152ms 1 - aten::empty 0.60% 30.610us 0.60% 30.610us 5.102us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 6.310us 0.12% 6.310us 2.103us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.29% 168.946us 3.29% 168.946us 56.315us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.17% 8.721us 0.44% 22.392us 3.732us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.27% 13.671us 0.27% 13.671us 2.279us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 58.33% 2.994ms 58.33% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.96% 310.158us 41.66% 2.167ms 2.167ms 0.000us 0.00% 4.643ms 4.643ms 1 + xformers_flash3::flash_fwd 2.83% 146.954us 35.22% 1.832ms 610.728us 0.000us 0.00% 4.643ms 1.548ms 3 + flash_attn_3::fwd 1.00% 51.911us 32.40% 1.685ms 561.744us 3.464ms 100.00% 4.643ms 1.548ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.04% 3.465ms 3.465ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.00% 3.464ms 1.155ms 3 + Activity Buffer Request 27.49% 1.430ms 27.49% 1.430ms 1.430ms 1.179ms 34.05% 1.179ms 1.179ms 1 + aten::empty 0.54% 28.311us 0.54% 28.311us 4.719us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.750us 0.11% 5.750us 1.917us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.25% 169.084us 3.25% 169.084us 56.361us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.17% 8.670us 0.48% 24.720us 4.120us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.31% 16.050us 0.31% 16.050us 2.675us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 58.34% 3.035ms 58.34% 3.035ms 3.035ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.133ms -Self CUDA time total: 3.405ms +Self CPU time total: 5.202ms +Self CUDA time total: 3.464ms impl wl p50(ms) ok -xformers_meff cuda_attn_L128_bfloat16 0.98 True +xformers_meff cuda_attn_L128_bfloat16 1.00 True xformers_meff cuda_attn_L256_bfloat16 1.03 True xformers_meff cuda_attn_L320_bfloat16 1.08 True -xformers_meff cuda_attn_L384_bfloat16 1.10 True -xformers_meff cuda_attn_L448_bfloat16 1.23 True -xformers_meff cuda_attn_L512_bfloat16 1.22 True +xformers_meff cuda_attn_L384_bfloat16 1.09 True +xformers_meff cuda_attn_L448_bfloat16 1.25 True +xformers_meff cuda_attn_L512_bfloat16 1.24 True
▶ UV Install Logs
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg index 689e04d1be57a1e800f341bc84fe4bfaf1387666..19f0903d77a8fb32c0a3ed03553c82706371801e 100644 --- a/flash_attn/results/artifacts/combine/latency.svg +++ b/flash_attn/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:168c229932ad06a68508a4a77b66485ff9bcf48ed736a5ffdd003f5cb9e8e639 -size 24777 +oid sha256:0a7d7b3dc8fc6b60a4b9f8bfcf3e229706548b71a8174822b89cc9a2746d3bbd +size 24787 diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html index 7a5f09ca394f53e1d971ad7b608a69d09750ab95..3a2204532e0ec8ef3588194f5c38935fb60f8208 100644 --- a/flash_attn/results/combined_results.html +++ b/flash_attn/results/combined_results.html @@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-28T14:09:17.505622 + 2025-10-29T14:28:03.109695 image/svg+xml @@ -3982,96 +3982,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 1.0 + 1.0 - + - + - 1.2 + 1.2 - + - + - 1.4 + 1.4 - + - + - 1.6 + 1.6 - + - + - 1.8 + 1.8 - + - + - 2.0 + 2.0 - + - + - 2.2 + 2.2 @@ -4079,73 +4079,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + - + - - - - - + + + + + @@ -4337,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True -hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True -hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True -hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True +hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True +hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True +hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False - Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd' + Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False - Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd' + Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False - Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd' + Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False - Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd' + Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False - Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd' + Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False - Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd' + Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' torch_flash_ma cuda_attn_L128_bfloat16 1.22 True -torch_flash_ma cuda_attn_L256_bfloat16 1.27 True -torch_flash_ma cuda_attn_L320_bfloat16 1.28 True -torch_flash_ma cuda_attn_L384_bfloat16 1.31 True +torch_flash_ma cuda_attn_L256_bfloat16 1.28 True +torch_flash_ma cuda_attn_L320_bfloat16 1.29 True +torch_flash_ma cuda_attn_L384_bfloat16 1.33 True torch_flash_ma cuda_attn_L448_bfloat16 1.47 True torch_flash_ma cuda_attn_L512_bfloat16 1.50 True -torch_mem_eff cuda_attn_L128_bfloat16 1.86 True -torch_mem_eff cuda_attn_L256_bfloat16 1.97 True -torch_mem_eff cuda_attn_L320_bfloat16 2.04 True -torch_mem_eff cuda_attn_L384_bfloat16 2.06 True -torch_mem_eff cuda_attn_L448_bfloat16 2.03 True +torch_mem_eff cuda_attn_L128_bfloat16 1.83 True +torch_mem_eff cuda_attn_L256_bfloat16 1.89 True +torch_mem_eff cuda_attn_L320_bfloat16 2.00 True +torch_mem_eff cuda_attn_L384_bfloat16 1.97 True +torch_mem_eff cuda_attn_L448_bfloat16 2.06 True torch_mem_eff cuda_attn_L512_bfloat16 2.19 True -xformers_meff cuda_attn_L128_bfloat16 0.98 True +xformers_meff cuda_attn_L128_bfloat16 1.00 True xformers_meff cuda_attn_L256_bfloat16 1.03 True xformers_meff cuda_attn_L320_bfloat16 1.08 True -xformers_meff cuda_attn_L384_bfloat16 1.10 True -xformers_meff cuda_attn_L448_bfloat16 1.23 True -xformers_meff cuda_attn_L512_bfloat16 1.22 True +xformers_meff cuda_attn_L384_bfloat16 1.09 True +xformers_meff cuda_attn_L448_bfloat16 1.25 True +xformers_meff cuda_attn_L512_bfloat16 1.24 True GENERATING COMBINED VISUALIZATION @@ -4402,7 +4402,7 @@ Implementations included:
▶ UV Install Logs
@@ -4415,7 +4415,7 @@ Installed 37 packages in 187ms - 2025-10-28T14:09:17.505622 + 2025-10-29T14:28:03.109695 image/svg+xml @@ -4525,96 +4525,96 @@ Installed 37 packages in 187ms - + - + - 1.0 + 1.0 - + - + - 1.2 + 1.2 - + - + - 1.4 + 1.4 - + - + - 1.6 + 1.6 - + - + - 1.8 + 1.8 - + - + - 2.0 + 2.0 - + - + - 2.2 + 2.2 @@ -4622,73 +4622,73 @@ Installed 37 packages in 187ms - + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + - + - - - - - + + + + + diff --git a/index.html b/index.html index 33ea1b019a71f451c81dbc10c5e67f8c6ca9b465..1061b4b3222caa3480fdd412bcf6f18bb97b54f9 100644 --- a/index.html +++ b/index.html @@ -1,89 +1,4029 @@ - + - - - Index of / - + + + index + + + + + -

Index of /

- +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

All Benchmarks Aggregated Report

+

Layer Norm

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels Layer NormHuggingFace kernels implementation
PyTorch Layer NormPyTorch native implementation
+

Rotary Position Embeddings

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels RotaryHuggingFace kernels implementation
PyTorch RotaryPyTorch native implementation
+

Flash Attention

+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ImplementationDescription
Flash AttentionFlash Attention implementation
HF Kernels Flash AttentionHuggingFace kernels Flash Attention
HF Kernels Flash Attention 3HuggingFace kernels Flash Attention 3
Memory Efficient AttentionMemory efficient attention implementation
Sage AttentionSage attention implementation
xFormersxFormers attention implementation
+

Causal Conv1D

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels Causal Conv1DHuggingFace kernels implementation
PyTorch Causal Conv1DPyTorch native implementation
+

Activation

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels SwiGLUHuggingFace kernels SwiGLU implementation
PyTorch SwiGLUPyTorch native SwiGLU implementation
+

ReLU

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels ReLUHuggingFace kernels ReLU implementation
PyTorch ReLUPyTorch native ReLU implementation
+
+ \ No newline at end of file diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl index fcd809d60a69166f4be7343612f4f810d256a506..611975ecd9585a8b6f1198e5f9cf417087baa85d 100644 --- a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl @@ -1,4 +1,4 @@ -{"ts": "2025-10-28T14:08:59Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8268990000033227, "p50": 0.8360890000176369, "p90": 0.8378790000733716, "mean": 0.8358750000070359, "iqr": 0.002010000116570154, "raw_times": [0.8426389999840467, 0.8268990000033227, 0.8378790000733716, 0.8360890000176369, 0.8358689999568014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8452999999235544, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6477070000746608, "p50": 1.6516379999984565, "p90": 1.6565669999408783, "mean": 1.6533151999965412, "iqr": 0.006360999918797461, "raw_times": [1.6565669999408783, 1.6516379999984565, 1.6477070000746608, 1.6604579999466296, 1.6502060000220808], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6544470000781075, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6441269999631913, "p50": 1.6532669999378413, "p90": 1.6534970000066096, "mean": 1.6500411999913922, "iqr": 0.009149999982582813, "raw_times": [1.6441269999631913, 1.6534970000066096, 1.6532669999378413, 1.6443470000240268, 1.654968000025292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6665570000213847, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.251962999911484, "p50": 3.270412999995642, "p90": 3.2735430000911947, "mean": 3.2660931999998866, "iqr": 0.01632000009976764, "raw_times": [3.2735430000911947, 3.251962999911484, 3.257222999991427, 3.277324000009685, 3.270412999995642], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2640430000583365, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8322699999894212, "p50": 0.8364899999833142, "p90": 0.8382409999967422, "mean": 0.8359703999872181, "iqr": 0.0036810000096920703, "raw_times": [0.8322699999894212, 0.8382909999795629, 0.8345599999870501, 0.8382409999967422, 0.8364899999833142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8381600000006983, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6435499999829517, "p50": 1.6499199999771008, "p90": 1.6516499999852385, "mean": 1.650240399987979, "iqr": 0.0024989999474200886, "raw_times": [1.6516499999852385, 1.6435499999829517, 1.6499199999771008, 1.6491510000378184, 1.656930999956785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.644769999984419, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6425610000396773, "p50": 1.6517310000381258, "p90": 1.654420999955164, "mean": 1.6505027999983213, "iqr": 0.006990999963818467, "raw_times": [1.6474299999913455, 1.6517310000381258, 1.654420999955164, 1.6563709999672938, 1.6425610000396773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6471609999939574, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.237169999977141, "p50": 3.2576509999557857, "p90": 3.264301000001524, "mean": 3.257706599993071, "iqr": 0.008230999981151399, "raw_times": [3.2576509999557857, 3.264301000001524, 3.2733410000105323, 3.2560700000203724, 3.237169999977141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2725309999932506, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null} diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html index cfec5b11856445875be968b8022bf0064c0ca56f..9e9cf8da940eb80e201b94351f6e97b42048c103 100644 --- a/layer_norm/impls/hf_kernels_layer_norm.html +++ b/layer_norm/impls/hf_kernels_layer_norm.html @@ -3862,8 +3862,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-

on_github: huggingface/kernels-uvnotes

-

HF Kernels LayerNorm Implementation

+

HF Kernels LayerNorm Implementation

Based on kernels-community layer-norm kernel.

LayerNorm Benchmark (HF Kernels)

@@ -3873,10 +3872,11 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 7.03s +Cell: benchmark | 6.34s | Raw +GitHub
@@ -3943,19 +3943,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 4.56% 180.575us 46.01% 1.822ms 1.822ms 0.000us 0.00% 3.098ms 3.098ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.70% 67.272us 40.91% 1.619ms 539.829us 2.362ms 100.00% 3.098ms 1.033ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.363ms 100.06% 2.363ms 2.363ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.00% 2.362ms 787.305us 3 - Activity Buffer Request 36.75% 1.455ms 36.75% 1.455ms 1.455ms 736.127us 31.17% 736.127us 736.127us 1 - aten::view 0.54% 21.512us 0.54% 21.512us 3.585us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.17% 46.231us 1.17% 46.231us 5.137us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.070us 0.23% 9.070us 3.023us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.06% 41.913us 1.06% 41.913us 13.971us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 53.99% 2.137ms 53.99% 2.137ms 2.137ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 5.26% 209.855us 46.73% 1.864ms 1.864ms 0.000us 0.00% 3.097ms 3.097ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 1.78% 70.832us 40.86% 1.630ms 543.337us 2.360ms 100.00% 3.097ms 1.032ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.06% 2.362ms 2.362ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.360ms 100.00% 2.360ms 786.699us 3 + Activity Buffer Request 36.61% 1.461ms 36.61% 1.461ms 1.461ms 736.736us 31.22% 736.736us 736.736us 1 + aten::view 0.61% 24.271us 0.61% 24.271us 4.045us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.19% 47.642us 1.19% 47.642us 5.294us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 10.789us 0.27% 10.789us 3.596us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.01% 40.102us 1.01% 40.102us 13.367us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 53.27% 2.125ms 53.27% 2.125ms 2.125ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.959ms -Self CUDA time total: 2.362ms +Self CPU time total: 3.989ms +Self CUDA time total: 2.360ms @@ -3965,19 +3965,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 2.19% 144.024us 30.18% 1.989ms 1.989ms 0.000us 0.00% 6.322ms 6.322ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.641us 27.80% 1.832ms 610.764us 4.774ms 100.00% 6.322ms 2.107ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.776ms 100.03% 4.776ms 4.776ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.00% 4.774ms 1.591ms 3 - Activity Buffer Request 26.09% 1.720ms 26.09% 1.720ms 1.720ms 1.548ms 32.42% 1.548ms 1.548ms 1 - aten::view 0.20% 12.871us 0.20% 12.871us 2.145us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.50% 32.981us 0.50% 32.981us 3.665us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.881us 0.07% 4.881us 1.627us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.44% 29.151us 0.44% 29.151us 9.717us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 69.82% 4.602ms 69.82% 4.602ms 4.602ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 2.24% 143.733us 27.27% 1.751ms 1.751ms 0.000us 0.00% 6.440ms 6.440ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 48.181us 24.84% 1.595ms 531.669us 4.846ms 100.00% 6.440ms 2.147ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.848ms 100.03% 4.848ms 4.848ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.846ms 100.00% 4.846ms 1.615ms 3 + Activity Buffer Request 23.08% 1.482ms 23.08% 1.482ms 1.482ms 1.594ms 32.88% 1.594ms 1.594ms 1 + aten::view 0.20% 12.572us 0.20% 12.572us 2.095us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.46% 29.840us 0.46% 29.840us 3.316us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.420us 0.08% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.46% 29.490us 0.46% 29.490us 9.830us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 72.73% 4.670ms 72.73% 4.670ms 4.670ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.591ms -Self CUDA time total: 4.774ms +Self CPU time total: 6.421ms +Self CUDA time total: 4.846ms @@ -3987,19 +3987,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 1.89% 121.823us 28.69% 1.852ms 1.852ms 0.000us 0.00% 6.323ms 6.323ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 44.435us 26.61% 1.718ms 572.663us 4.766ms 100.00% 6.323ms 2.108ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.767ms 100.03% 4.767ms 4.767ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.766ms 100.00% 4.766ms 1.589ms 3 - Activity Buffer Request 24.91% 1.608ms 24.91% 1.608ms 1.608ms 1.557ms 32.67% 1.557ms 1.557ms 1 - aten::view 0.19% 12.441us 0.19% 12.441us 2.074us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.50% 32.030us 0.50% 32.030us 3.559us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.850us 0.08% 4.850us 1.617us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.44% 28.190us 0.44% 28.190us 9.397us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 71.31% 4.604ms 71.31% 4.604ms 4.604ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 1.96% 126.465us 27.43% 1.766ms 1.766ms 0.000us 0.00% 6.435ms 6.435ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.779us 25.26% 1.627ms 542.360us 4.838ms 100.00% 6.435ms 2.145ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.839ms 100.03% 4.839ms 4.839ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.838ms 100.00% 4.838ms 1.613ms 3 + Activity Buffer Request 23.54% 1.516ms 23.54% 1.516ms 1.516ms 1.597ms 33.01% 1.597ms 1.597ms 1 + aten::view 0.20% 12.929us 0.20% 12.929us 2.155us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.46% 29.911us 0.46% 29.911us 3.323us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.300us 0.08% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.45% 29.003us 0.45% 29.003us 9.668us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 72.57% 4.674ms 72.57% 4.674ms 4.674ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.457ms -Self CUDA time total: 4.766ms +Self CPU time total: 6.440ms +Self CUDA time total: 4.838ms @@ -4009,37 +4009,38 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 1.32% 150.697us 17.31% 1.975ms 1.975ms 0.000us 0.00% 12.822ms 12.822ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.42% 47.993us 15.87% 1.810ms 603.497us 9.629ms 100.00% 12.822ms 4.274ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.631ms 100.01% 9.631ms 9.631ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.629ms 100.00% 9.629ms 3.210ms 3 - Activity Buffer Request 12.56% 1.433ms 12.56% 1.433ms 1.433ms 3.193ms 33.16% 3.193ms 3.193ms 1 - aten::view 0.12% 13.330us 0.12% 13.330us 2.222us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.28% 32.431us 0.28% 32.431us 3.603us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.260us 0.05% 5.260us 1.753us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.56% 291.579us 2.56% 291.579us 97.193us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 82.69% 9.436ms 82.69% 9.436ms 9.436ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 1.17% 134.085us 17.09% 1.957ms 1.957ms 0.000us 0.00% 12.886ms 12.886ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.41% 46.869us 15.80% 1.809ms 603.015us 9.665ms 100.00% 12.886ms 4.295ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.667ms 100.01% 9.667ms 9.667ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.665ms 100.00% 9.665ms 3.222ms 3 + Activity Buffer Request 12.76% 1.462ms 12.76% 1.462ms 1.462ms 3.220ms 33.32% 3.220ms 3.220ms 1 + aten::view 0.12% 13.968us 0.12% 13.968us 2.328us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.26% 30.043us 0.26% 30.043us 3.338us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.590us 0.05% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.31% 264.797us 2.31% 264.797us 88.266us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 82.91% 9.495ms 82.91% 9.495ms 9.495ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 11.410ms -Self CUDA time total: 9.629ms +Self CPU time total: 11.452ms +Self CUDA time total: 9.665ms impl wl p50(ms) ok hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True -hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True +hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] -Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 8.47it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:02<00:02, 1.44s/it] -Fetching 4 files: 100%|██████████| 4/4 [00:02<00:00, 1.61it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.22it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.44it/s]

Artifacts:

layer_norm.jsonl diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html index 72ce43dc70edcb0cbcced09b58a31530fadba3d8..f5dd45a5ed15040ec9f80c48eca459fb67a1bc56 100644 --- a/layer_norm/impls/torch_layer_norm.html +++ b/layer_norm/impls/torch_layer_norm.html @@ -3862,8 +3862,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-

on_github: huggingface/kernels-uvnotes

-

Torch LayerNorm Implementation

+

Torch LayerNorm Implementation

GPU Info

@@ -3872,10 +3871,11 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.22s +Cell: nv | 0.26s | Raw +GitHub
@@ -3887,7 +3887,7 @@ Cell: nv | 0.22s
-
Tue Oct 28 14:08:35 2025       
+
Wed Oct 29 14:26:26 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.22s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   31C    P0            141W /  350W |       0MiB /  46068MiB |     21%      Default |
+| N/A   30C    P0            108W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,10 +3920,11 @@ Cell: nv | 0.22s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 7.39s
+Cell: benchmark | 7.36s
  | 
 
 Raw
+GitHub
 
@@ -3967,19 +3968,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_layer_norm 3.94% 153.126us 46.06% 1.791ms 1.791ms 0.000us 0.00% 3.027ms 3.027ms 1 - aten::layer_norm 0.44% 17.151us 42.12% 1.638ms 545.972us 0.000us 0.00% 3.027ms 1.009ms 3 - aten::native_layer_norm 1.99% 77.265us 41.68% 1.621ms 540.255us 2.317ms 100.00% 3.027ms 1.009ms 3 + torch_layer_norm 3.90% 151.572us 46.01% 1.786ms 1.786ms 0.000us 0.00% 3.026ms 3.026ms 1 + aten::layer_norm 0.43% 16.762us 42.11% 1.635ms 544.851us 0.000us 0.00% 3.026ms 1.009ms 3 + aten::native_layer_norm 2.06% 80.009us 41.67% 1.618ms 539.263us 2.316ms 100.00% 3.026ms 1.009ms 3 torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.318ms 100.06% 2.318ms 2.318ms 1 -void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.317ms 100.00% 2.317ms 772.230us 3 - Activity Buffer Request 37.14% 1.444ms 37.14% 1.444ms 1.444ms 709.980us 30.65% 709.980us 709.980us 1 - aten::empty 1.21% 46.960us 1.21% 46.960us 5.218us 0.000us 0.00% 0.000us 0.000us 9 - cudaLaunchKernel 1.16% 45.271us 1.16% 45.271us 15.090us 0.000us 0.00% 0.000us 0.000us 3 - aten::view 0.18% 7.130us 0.18% 7.130us 1.188us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 53.94% 2.098ms 53.94% 2.098ms 2.098ms 0.000us 0.00% 0.000us 0.000us 1 +void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.316ms 100.00% 2.316ms 772.127us 3 + Activity Buffer Request 37.08% 1.440ms 37.08% 1.440ms 1.440ms 709.855us 30.65% 709.855us 709.855us 1 + aten::empty 1.19% 46.261us 1.19% 46.261us 5.140us 0.000us 0.00% 0.000us 0.000us 9 + cudaLaunchKernel 1.16% 45.163us 1.16% 45.163us 15.054us 0.000us 0.00% 0.000us 0.000us 3 + aten::view 0.17% 6.761us 0.17% 6.761us 1.127us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 53.99% 2.096ms 53.99% 2.096ms 2.096ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.889ms -Self CUDA time total: 2.317ms +Self CPU time total: 3.882ms +Self CUDA time total: 2.316ms @@ -3989,19 +3990,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_layer_norm 1.11% 71.092us 25.40% 1.622ms 1.622ms 0.000us 0.00% 6.494ms 6.494ms 1 - aten::layer_norm 0.16% 10.119us 24.29% 1.551ms 517.038us 0.000us 0.00% 6.494ms 2.165ms 3 - aten::native_layer_norm 0.82% 52.103us 24.13% 1.541ms 513.665us 4.898ms 100.00% 6.494ms 2.165ms 3 - torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.899ms 100.03% 4.899ms 4.899ms 1 -void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.898ms 100.00% 4.898ms 1.633ms 3 - Activity Buffer Request 22.36% 1.428ms 22.36% 1.428ms 1.428ms 1.596ms 32.59% 1.596ms 1.596ms 1 - aten::empty 0.49% 31.052us 0.49% 31.052us 3.450us 0.000us 0.00% 0.000us 0.000us 9 - cudaLaunchKernel 0.41% 26.160us 0.41% 26.160us 8.720us 0.000us 0.00% 0.000us 0.000us 3 - aten::view 0.06% 3.830us 0.06% 3.830us 0.638us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 74.60% 4.764ms 74.60% 4.764ms 4.764ms 0.000us 0.00% 0.000us 0.000us 1 + torch_layer_norm 1.19% 75.581us 25.55% 1.628ms 1.628ms 0.000us 0.00% 6.473ms 6.473ms 1 + aten::layer_norm 0.14% 9.142us 24.37% 1.553ms 517.550us 0.000us 0.00% 6.473ms 2.158ms 3 + aten::native_layer_norm 0.81% 51.921us 24.22% 1.544ms 514.502us 4.881ms 100.00% 6.473ms 2.158ms 3 + torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.882ms 100.03% 4.882ms 4.882ms 1 +void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.881ms 100.00% 4.881ms 1.627ms 3 + Activity Buffer Request 22.46% 1.431ms 22.46% 1.431ms 1.431ms 1.592ms 32.61% 1.592ms 1.592ms 1 + aten::empty 0.44% 27.841us 0.44% 27.841us 3.093us 0.000us 0.00% 0.000us 0.000us 9 + cudaLaunchKernel 0.45% 28.910us 0.45% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3 + aten::view 0.06% 3.829us 0.06% 3.829us 0.638us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 74.45% 4.743ms 74.45% 4.743ms 4.743ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.386ms -Self CUDA time total: 4.898ms +Self CPU time total: 6.372ms +Self CUDA time total: 4.881ms @@ -4011,19 +4012,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_layer_norm 1.17% 72.893us 26.00% 1.616ms 1.616ms 0.000us 0.00% 6.248ms 6.248ms 1 - aten::layer_norm 0.15% 9.290us 24.82% 1.543ms 514.468us 0.000us 0.00% 6.248ms 2.083ms 3 - aten::native_layer_norm 0.84% 52.403us 24.67% 1.534ms 511.371us 4.735ms 100.00% 6.248ms 2.083ms 3 - torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.736ms 100.03% 4.736ms 4.736ms 1 -void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.735ms 100.00% 4.735ms 1.578ms 3 - Activity Buffer Request 22.86% 1.421ms 22.86% 1.421ms 1.421ms 1.513ms 31.96% 1.513ms 1.513ms 1 - aten::empty 0.47% 29.320us 0.47% 29.320us 3.258us 0.000us 0.00% 0.000us 0.000us 9 - cudaLaunchKernel 0.43% 26.781us 0.43% 26.781us 8.927us 0.000us 0.00% 0.000us 0.000us 3 - aten::view 0.07% 4.140us 0.07% 4.140us 0.690us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 74.00% 4.601ms 74.00% 4.601ms 4.601ms 0.000us 0.00% 0.000us 0.000us 1 + torch_layer_norm 1.15% 71.882us 26.71% 1.668ms 1.668ms 0.000us 0.00% 6.222ms 6.222ms 1 + aten::layer_norm 0.15% 9.629us 25.56% 1.596ms 532.153us 0.000us 0.00% 6.222ms 2.074ms 3 + aten::native_layer_norm 0.90% 56.373us 25.41% 1.587ms 528.943us 4.717ms 100.00% 6.222ms 2.074ms 3 + torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.718ms 100.03% 4.718ms 4.718ms 1 +void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.717ms 100.00% 4.717ms 1.572ms 3 + Activity Buffer Request 23.44% 1.464ms 23.44% 1.464ms 1.464ms 1.506ms 31.93% 1.506ms 1.506ms 1 + aten::empty 0.46% 28.850us 0.46% 28.850us 3.206us 0.000us 0.00% 0.000us 0.000us 9 + cudaLaunchKernel 0.52% 32.781us 0.52% 32.781us 10.927us 0.000us 0.00% 0.000us 0.000us 3 + aten::view 0.07% 4.590us 0.07% 4.590us 0.765us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 73.29% 4.577ms 73.29% 4.577ms 4.577ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.218ms -Self CUDA time total: 4.735ms +Self CPU time total: 6.246ms +Self CUDA time total: 4.717ms @@ -4033,19 +4034,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_layer_norm 0.66% 74.633us 14.54% 1.650ms 1.650ms 0.000us 0.00% 13.090ms 13.090ms 1 - aten::layer_norm 0.09% 9.800us 13.88% 1.575ms 525.028us 0.000us 0.00% 13.090ms 4.363ms 3 - aten::native_layer_norm 0.45% 51.390us 13.79% 1.565ms 521.762us 9.838ms 100.00% 13.090ms 4.363ms 3 - torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.839ms 100.01% 9.839ms 9.839ms 1 -void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.838ms 100.00% 9.838ms 3.279ms 3 - Activity Buffer Request 11.36% 1.289ms 11.36% 1.289ms 1.289ms 3.253ms 33.06% 3.253ms 3.253ms 1 - aten::empty 0.28% 31.381us 0.28% 31.381us 3.487us 0.000us 0.00% 0.000us 0.000us 9 - cudaLaunchKernel 1.67% 189.088us 1.67% 189.088us 63.029us 0.000us 0.00% 0.000us 0.000us 3 - aten::view 0.04% 4.121us 0.04% 4.121us 0.687us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 85.46% 9.697ms 85.46% 9.697ms 9.697ms 0.000us 0.00% 0.000us 0.000us 1 + torch_layer_norm 0.67% 74.340us 13.35% 1.490ms 1.490ms 0.000us 0.00% 13.028ms 13.028ms 1 + aten::layer_norm 0.09% 9.510us 12.69% 1.416ms 471.835us 0.000us 0.00% 13.028ms 4.343ms 3 + aten::native_layer_norm 0.47% 52.269us 12.60% 1.406ms 468.665us 9.808ms 100.00% 13.028ms 4.343ms 3 + torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.809ms 100.02% 9.809ms 9.809ms 1 +void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.808ms 100.00% 9.808ms 3.269ms 3 + Activity Buffer Request 9.72% 1.085ms 9.72% 1.085ms 1.085ms 3.220ms 32.83% 3.220ms 3.220ms 1 + aten::empty 0.26% 29.181us 0.26% 29.181us 3.242us 0.000us 0.00% 0.000us 0.000us 9 + cudaLaunchKernel 2.11% 235.817us 2.11% 235.817us 78.606us 0.000us 0.00% 0.000us 0.000us 3 + aten::view 0.04% 4.022us 0.04% 4.022us 0.670us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 86.65% 9.669ms 86.65% 9.669ms 9.669ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 11.347ms -Self CUDA time total: 9.838ms +Self CPU time total: 11.159ms +Self CUDA time total: 9.808ms impl wl p50(ms) ok @@ -4057,7 +4058,7 @@ torch_layer_norm LN_B16_S4096_D8192 3.33 True
▶ UV Install Logs
diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg index 51fba97fb0809dfd942d52b9b34e8a096d515676..c17ece602ed5ebc325bf99b71237b08ca31fbe89 100644 --- a/layer_norm/results/artifacts/combine/latency.svg +++ b/layer_norm/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e41c135df9f0b506fa1ac950b90bd609d850f01d79b3171b3678c24fdab066a -size 14645 +oid sha256:8fd53794c4617f7e947676c655de6f739b720b8f16a59432369c127bfc08190a +size 14644 diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html index 616fba09e8126d17fe18ed8e4396c65eb84adaef..5a42e66a6787e88853b7090c03ba6d4a8cd04457 100644 --- a/layer_norm/results/combined_results.html +++ b/layer_norm/results/combined_results.html @@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-28T14:09:21.825978 + 2025-10-29T14:27:45.722521 image/svg+xml @@ -3956,70 +3956,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 1.0 + 1.0 - + - + - 1.5 + 1.5 - + - + - 2.0 + 2.0 - + - + - 2.5 + 2.5 - + - + - 3.0 + 3.0 @@ -4027,27 +4027,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - + + - + - - - - + + + + @@ -4105,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.25s +Cell: combine | 4.21s | Raw @@ -4195,7 +4195,7 @@ impl wl p50(ms) ok hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True -hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True +hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True torch_layer_norm LN_B16_S2048_D4096 0.82 True torch_layer_norm LN_B16_S2048_D8192 1.68 True torch_layer_norm LN_B16_S4096_D4096 1.61 True @@ -4219,7 +4219,7 @@ Implementations included:
▶ UV Install Logs
@@ -4232,7 +4232,7 @@ Installed 37 packages in 219ms - 2025-10-28T14:09:21.825978 + 2025-10-29T14:27:45.722521 image/svg+xml @@ -4316,70 +4316,70 @@ Installed 37 packages in 219ms - + - + - 1.0 + 1.0 - + - + - 1.5 + 1.5 - + - + - 2.0 + 2.0 - + - + - 2.5 + 2.5 - + - + - 3.0 + 3.0 @@ -4387,27 +4387,27 @@ Installed 37 packages in 219ms - + - - + + - + - - - - + + + + diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl index 18ed4f37499b08e63b86a43f9ee0bdc193375b0d..e407db0807eb78b1db05edcb765f594b555812aa 100644 --- a/rotary/impls/artifacts/benchmark/rotary.jsonl +++ b/rotary/impls/artifacts/benchmark/rotary.jsonl @@ -1,24 +1,24 @@ -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1724160000549091, "p50": 0.17308600001797458, "p90": 0.1756759999125279, "mean": 0.1760500000045795, "iqr": 0.0032199998258874984, "raw_times": [0.17245600008664042, 0.1756759999125279, 0.1724160000549091, 0.17308600001797458, 0.18661599995084543], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.17975699995531613, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22642799990535423, "p50": 0.2294280000114668, "p90": 0.23093799995876907, "mean": 0.23135619996992318, "iqr": 0.0026599999500831473, "raw_times": [0.23093799995876907, 0.22642799990535423, 0.22827800000868592, 0.2417089999653399, 0.2294280000114668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23494799995660287, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21688800006813835, "p50": 0.21992799997860857, "p90": 0.2219079999576934, "mean": 0.22172000001319248, "iqr": 0.004439999884198187, "raw_times": [0.2174680000734952, 0.2219079999576934, 0.21688800006813835, 0.23240799998802686, 0.21992799997860857], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.225418000013633, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21487700007583044, "p50": 0.21964699999443837, "p90": 0.22132800006602338, "mean": 0.21978760003094067, "iqr": 0.005100000066704524, "raw_times": [0.21487700007583044, 0.21622799999931885, 0.21964699999443837, 0.22132800006602338, 0.2268580000190923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24882799993974913, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21332699998311, "p50": 0.21615699995436444, "p90": 0.21744800005762954, "mean": 0.21590960000139603, "iqr": 0.0025000000505315256, "raw_times": [0.21332699998311, 0.21744800005762954, 0.21494800000709802, 0.21766800000477815, 0.21615699995436444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22001800005000405, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21038799991401902, "p50": 0.21561700009442575, "p90": 0.21720800009461527, "mean": 0.22098599999935686, "iqr": 0.004100000182916119, "raw_times": [0.21038799991401902, 0.21720800009461527, 0.21561700009442575, 0.24860899998202513, 0.21310799991169915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2178580000418151, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21404700009952649, "p50": 0.21557699994900759, "p90": 0.2158679999411106, "mean": 0.2152116000161186, "iqr": 0.0011999999287581886, "raw_times": [0.2158679999411106, 0.21589800007859594, 0.21404700009952649, 0.21466800001235242, 0.21557699994900759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21567799990407366, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21701799994389148, "p50": 0.21822700000484474, "p90": 0.22002800005793688, "mean": 0.2237478000097326, "iqr": 0.002031000008173578, "raw_times": [0.22002800005793688, 0.2179970000497633, 0.2454689999922266, 0.21822700000484474, 0.21701799994389148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22291799996310147, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21712800003115262, "p50": 0.21885700004986575, "p90": 0.2196080000658185, "mean": 0.22401780001928273, "iqr": 0.001630000042496249, "raw_times": [0.21797800002332224, 0.2196080000658185, 0.24651799992625456, 0.21885700004986575, 0.21712800003115262], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2206780000051367, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21462800009430794, "p50": 0.21782799990432977, "p90": 0.21795700001803198, "mean": 0.21911359999648994, "iqr": 0.0030300000162242213, "raw_times": [0.21462800009430794, 0.23022799996397225, 0.21782799990432977, 0.21492700000180776, 0.21795700001803198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2186980000260519, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21579799999926763, "p50": 0.21701699995446688, "p90": 0.22130799993647088, "mean": 0.2237457999626713, "iqr": 0.004450000005817856, "raw_times": [0.21701699995446688, 0.22130799993647088, 0.21579799999926763, 0.24774799999249808, 0.21685799993065302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22235700009787251, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22434800007431477, "p50": 0.2248280000003433, "p90": 0.22490799995011912, "mean": 0.22479799997654482, "iqr": 0.00031000001854408765, "raw_times": [0.2248280000003433, 0.22490799995011912, 0.22459799993157503, 0.22434800007431477, 0.22530799992637185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23522799995134847, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21574699997017888, "p50": 0.21802799994929956, "p90": 0.21904799996264046, "mean": 0.22033179998288688, "iqr": 0.0018999999156221747, "raw_times": [0.21714800004701829, 0.21802799994929956, 0.2316879999852972, 0.21904799996264046, 0.21574699997017888], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22168800001054478, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21753800001533818, "p50": 0.21888800006308884, "p90": 0.22129700005280029, "mean": 0.22190180004599824, "iqr": 0.003358999947522534, "raw_times": [0.21753800001533818, 0.23384799999348616, 0.21793800010527775, 0.21888800006308884, 0.22129700005280029], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22266799999215436, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2166670000178783, "p50": 0.21850699999959033, "p90": 0.21964699999443837, "mean": 0.21864339998955984, "iqr": 0.001419000000169035, "raw_times": [0.21850699999959033, 0.2166670000178783, 0.22016799994162284, 0.21822799999426934, 0.21964699999443837], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23552799996195972, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21624800001518452, "p50": 0.21773700007088337, "p90": 0.21802799994929956, "mean": 0.21774760000425886, "iqr": 0.0013409999155555852, "raw_times": [0.21668700003374397, 0.21773700007088337, 0.22003799995218287, 0.21624800001518452, 0.21802799994929956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226780000000872, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21748699998624943, "p50": 0.22014700005001941, "p90": 0.22206799997093185, "mean": 0.22232159999475698, "iqr": 0.0019999999949504854, "raw_times": [0.22014700005001941, 0.23183799999060284, 0.22206799997093185, 0.21748699998624943, 0.22006799997598137], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22040800001832395, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21601800006010308, "p50": 0.21957800004202, "p90": 0.22023799999715266, "mean": 0.2213318000030995, "iqr": 0.0024510000002919696, "raw_times": [0.23303799991936103, 0.21601800006010308, 0.2177869999968607, 0.21957800004202, 0.22023799999715266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.220787999978711, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21692799998618284, "p50": 0.22003699996275827, "p90": 0.2230679999684071, "mean": 0.222287800011145, "iqr": 0.0031599998919773498, "raw_times": [0.21692799998618284, 0.21990800007642974, 0.2314980000619471, 0.2230679999684071, 0.22003699996275827], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22102700006598752, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160679999860804, "p50": 0.21972700005790102, "p90": 0.22029800004474964, "mean": 0.21970960001453932, "iqr": 0.0024610000082248007, "raw_times": [0.2160679999860804, 0.2246179999474407, 0.22029800004474964, 0.21972700005790102, 0.21783700003652484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22191799996562622, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172279999967941, "p50": 0.21847799996521644, "p90": 0.22105800007921061, "mean": 0.22193580000475777, "iqr": 0.0035110000453641987, "raw_times": [0.21847799996521644, 0.22105800007921061, 0.23536799994872126, 0.21754700003384642, 0.2172279999967941], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22206799997093185, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21436800000174117, "p50": 0.21785799992812827, "p90": 0.2195579999124675, "mean": 0.2202379999744153, "iqr": 0.0030299999025373836, "raw_times": [0.21436800000174117, 0.21785799992812827, 0.2195579999124675, 0.2165280000099301, 0.2328780000198094], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25353900002755836, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22968799999034673, "p50": 0.23015800002212927, "p90": 0.23064800006977748, "mean": 0.23369620002995362, "iqr": 0.0006600000688194996, "raw_times": [0.23015800002212927, 0.24799900006655662, 0.22968799999034673, 0.22998800000095798, 0.23064800006977748], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042800000894204, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347319999804313, "p50": 0.6375930000785957, "p90": 0.639283000055002, "mean": 0.6376124000325945, "iqr": 0.003270999968663091, "raw_times": [0.6375930000785957, 0.636012000086339, 0.6404419999626043, 0.6347319999804313, 0.639283000055002], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.639422999938688, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17006399997399058, "p50": 0.17533400000502297, "p90": 0.1853339999797754, "mean": 0.1802961999942454, "iqr": 0.014799999974002276, "raw_times": [0.17533400000502297, 0.20021500000666492, 0.17053400000577312, 0.1853339999797754, 0.17006399997399058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18331500001522727, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2246159999685915, "p50": 0.2266160000203854, "p90": 0.22888500001272405, "mean": 0.22735560000910482, "iqr": 0.002880000010918593, "raw_times": [0.22600500000180546, 0.22888500001272405, 0.2246159999685915, 0.2266160000203854, 0.2306560000420177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2471160000254713, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21813499995460006, "p50": 0.22189599997091136, "p90": 0.2272149999953399, "mean": 0.22315939997952228, "iqr": 0.007960000004914036, "raw_times": [0.2272149999953399, 0.22189599997091136, 0.21925499999042586, 0.21813499995460006, 0.2292959999863342], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2391049999914685, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21478600001501036, "p50": 0.21544499998071842, "p90": 0.2178249999928994, "mean": 0.2161891999890031, "iqr": 0.0027799999884337012, "raw_times": [0.21544499998071842, 0.2178249999928994, 0.21478600001501036, 0.21784499995192164, 0.2150450000044657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22814599998355334, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160950000416051, "p50": 0.22390499998437008, "p90": 0.22473600000694205, "mean": 0.22559540000202105, "iqr": 0.00507100003233063, "raw_times": [0.22390499998437008, 0.24357600000257662, 0.22473600000694205, 0.2160950000416051, 0.21966499997461142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.229085999990275, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21366499998975996, "p50": 0.21597500000325454, "p90": 0.21670500001391702, "mean": 0.2158129999884295, "iqr": 0.0008600000569458643, "raw_times": [0.21366499998975996, 0.2168749999782449, 0.21597500000325454, 0.21670500001391702, 0.21584499995697115], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21572499997546402, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2142449999951168, "p50": 0.21574499999132968, "p90": 0.2169850000086626, "mean": 0.21585539999477987, "iqr": 0.0022990000161371427, "raw_times": [0.2142449999951168, 0.2176159999862648, 0.21468599999252547, 0.2169850000086626, 0.21574499999132968], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2192349999745602, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21675499999673775, "p50": 0.21711599998752718, "p90": 0.21833499999956985, "mean": 0.2174776000060774, "iqr": 0.0015789999565640755, "raw_times": [0.21675499999673775, 0.21711599998752718, 0.21833499999956985, 0.21675600004300577, 0.2184260000035465], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22064600000248902, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2153649999740992, "p50": 0.21702599997297511, "p90": 0.21829499996783852, "mean": 0.21729759998834197, "iqr": 0.0014989999499448459, "raw_times": [0.2153649999740992, 0.21679600001789368, 0.21900600000890336, 0.21702599997297511, 0.21829499996783852], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22154499998805477, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143060000321384, "p50": 0.2161449999675824, "p90": 0.21640500000330576, "mean": 0.21578740000904872, "iqr": 0.0008589999538344273, "raw_times": [0.21653499999274572, 0.21640500000330576, 0.2143060000321384, 0.2161449999675824, 0.21554600004947133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23455599995259035, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21488499999122723, "p50": 0.21633500000461936, "p90": 0.21918499999173946, "mean": 0.21730919999072285, "iqr": 0.004300000000512227, "raw_times": [0.21488499999122723, 0.21918499999173946, 0.21488499999122723, 0.22125599997480094, 0.21633500000461936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2185359999771208, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2230359999657594, "p50": 0.22526600002947816, "p90": 0.22695600000588456, "mean": 0.22723160000168718, "iqr": 0.0026509999884183344, "raw_times": [0.22526600002947816, 0.2230359999657594, 0.23659499998984757, 0.22430500001746623, 0.22695600000588456], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22456599998577076, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21608499997682884, "p50": 0.2175149999743553, "p90": 0.22948600002337116, "mean": 0.2247094000040306, "iqr": 0.012610999988282856, "raw_times": [0.21608499997682884, 0.2168750000350883, 0.22948600002337116, 0.24358600001050945, 0.2175149999743553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21851499997183055, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2162149999662688, "p50": 0.21694499997693129, "p90": 0.2171250000060354, "mean": 0.21706100000074002, "iqr": 0.0003099999617006688, "raw_times": [0.2162149999662688, 0.21694499997693129, 0.2171250000060354, 0.21820500001012988, 0.21681500004433474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21809500003655558, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2136749999976928, "p50": 0.21658500003240988, "p90": 0.21662599999672238, "mean": 0.21621120000645533, "iqr": 0.00066100000140068, "raw_times": [0.2136749999976928, 0.2159649999953217, 0.21658500003240988, 0.21662599999672238, 0.21820500001012988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2181750000431748, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21695499998486412, "p50": 0.21774499998628016, "p90": 0.2285450000272249, "mean": 0.22256720000086716, "iqr": 0.010920000022451859, "raw_times": [0.21774499998628016, 0.21762500000477303, 0.2319660000011936, 0.21695499998486412, 0.2285450000272249], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22269599998026024, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21440599999777987, "p50": 0.21785499995985447, "p90": 0.2335159999802272, "mean": 0.2228595999895333, "iqr": 0.01891099998374557, "raw_times": [0.21440599999777987, 0.23391600001332336, 0.21785499995985447, 0.2335159999802272, 0.21460499999648164], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21932499998911226, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133250000042608, "p50": 0.21422499997925115, "p90": 0.21653499999274572, "mean": 0.21708740000576654, "iqr": 0.0029589999712698045, "raw_times": [0.21357600002147592, 0.2133250000042608, 0.21653499999274572, 0.21422499997925115, 0.2277760000310991], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22739600001386862, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21597500000325454, "p50": 0.2176859999849512, "p90": 0.21771499996248167, "mean": 0.21758339998996234, "iqr": 0.0013999999737279722, "raw_times": [0.2176859999849512, 0.21771499996248167, 0.22022600001037063, 0.2163149999887537, 0.21597500000325454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21613599994907418, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21577600000455277, "p50": 0.2173749999769825, "p90": 0.21900600000890336, "mean": 0.21836960000882755, "iqr": 0.0018509999790694565, "raw_times": [0.2171550000298339, 0.21577600000455277, 0.2225360000238652, 0.2173749999769825, 0.21900600000890336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22321599999486352, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21538499998996485, "p50": 0.21647599999141676, "p90": 0.21717500004569956, "mean": 0.2167214000110107, "iqr": 0.001030000021273736, "raw_times": [0.21717500004569956, 0.2184260000035465, 0.21538499998996485, 0.21614500002442583, 0.21647599999141676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21872600001415776, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21501500003751062, "p50": 0.2168760000245129, "p90": 0.2187050000088675, "mean": 0.21949320001795058, "iqr": 0.0030500000320898835, "raw_times": [0.21565499997677762, 0.2187050000088675, 0.21501500003751062, 0.23121500004208428, 0.2168760000245129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22076499999457155, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22937599999295344, "p50": 0.23008499999832566, "p90": 0.23144499999716572, "mean": 0.23359140000138723, "iqr": 0.0020100000028833165, "raw_times": [0.23144499999716572, 0.23008499999832566, 0.2294349999942824, 0.22937599999295344, 0.24761600002420892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23195599999326078, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.637245999996594, "p50": 0.6388759999822469, "p90": 0.6389449999915087, "mean": 0.6396317999929124, "iqr": 0.0012190000120426703, "raw_times": [0.6388759999822469, 0.6453660000147465, 0.6389449999915087, 0.637245999996594, 0.637725999979466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6364359999793123, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html index 2749f9f6b5f352621fbf7d1a4c5db169ca775615..0608b9088d0d84399b39661fd8d9fc01a39dbda5 100644 --- a/rotary/impls/hf_kernels_rotary.html +++ b/rotary/impls/hf_kernels_rotary.html @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.20s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
-
Tue Oct 28 14:08:24 2025       
+
Wed Oct 29 14:26:51 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   29C    P0             90W /  350W |       0MiB /  46068MiB |     24%      Default |
+| N/A   32C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 8.05s
+Cell: benchmark | 7.90s
  | 
 
 Raw
@@ -3989,23 +3989,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     452.802us      1907.02%     452.802us     452.802us             1  
-                                      hf_kernels_rotary        12.50%     264.332us        99.65%       2.107ms       2.107ms       0.000us         0.00%      24.960us      24.960us             1  
-                          _rotary_dba7d1e::apply_rotary         2.70%      57.162us         4.91%     103.733us      17.289us      16.928us        71.29%      16.928us       2.821us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us        71.29%      16.928us       2.821us             6  
-                                            aten::clone         2.21%      46.761us        79.27%       1.676ms     279.401us       0.000us         0.00%       8.032us       1.339us             6  
-                                            aten::copy_         2.31%      48.833us        74.02%       1.565ms     260.899us       6.816us        28.71%       8.032us       1.339us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        28.71%       6.816us       1.136us             6  
-                                Activity Buffer Request        68.03%       1.439ms        68.03%       1.439ms       1.439ms       1.216us         5.12%       1.216us       1.216us             1  
-                                    aten::empty_strided         3.04%      64.252us         3.04%      64.252us      10.709us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         3.68%      77.892us         3.68%      77.892us      12.982us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.33%      49.309us         2.97%      62.771us       5.231us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.64%      13.462us         0.64%      13.462us       1.122us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.20%      46.571us         2.20%      46.571us       7.762us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.35%       7.480us         0.35%       7.480us       7.480us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     433.056us      1833.74%     433.056us     433.056us             1  
+                                      hf_kernels_rotary        12.39%     257.808us        99.67%       2.073ms       2.073ms       0.000us         0.00%      24.832us      24.832us             1  
+                          _rotary_dba7d1e::apply_rotary         2.75%      57.199us         5.11%     106.332us      17.722us      16.960us        71.82%      16.960us       2.827us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us        71.82%      16.960us       2.827us             6  
+                                            aten::clone         2.11%      43.871us        79.26%       1.649ms     274.763us       0.000us         0.00%       7.872us       1.312us             6  
+                                            aten::copy_         2.19%      45.572us        74.13%       1.542ms     256.978us       6.656us        28.18%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us        28.18%       6.656us       1.109us             6  
+                                Activity Buffer Request        68.36%       1.422ms        68.36%       1.422ms       1.422ms       1.216us         5.15%       1.216us       1.216us             1  
+                                    aten::empty_strided         3.02%      62.841us         3.02%      62.841us      10.473us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.58%      74.452us         3.58%      74.452us      12.409us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.28%      47.469us         2.90%      60.410us       5.034us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.62%      12.941us         0.62%      12.941us       1.078us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.36%      49.133us         2.36%      49.133us       8.189us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.33%       6.850us         0.33%       6.850us       6.850us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.115ms
-Self CUDA time total: 23.744us
+Self CPU time total: 2.080ms
+Self CUDA time total: 23.616us
 
 
 
@@ -4015,23 +4015,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     357.532us      1513.94%     357.532us     357.532us             1  
-                                      hf_kernels_rotary         9.61%     183.785us        99.72%       1.907ms       1.907ms       0.000us         0.00%      24.736us      24.736us             1  
-                          _rotary_dba7d1e::apply_rotary         2.38%      45.511us         4.57%      87.364us      14.561us      16.832us        71.27%      16.832us       2.805us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us        71.27%      16.832us       2.805us             6  
-                                            aten::clone         1.27%      24.322us        83.40%       1.595ms     265.794us       0.000us         0.00%       7.904us       1.317us             6  
-                                            aten::copy_         1.98%      37.831us        80.39%       1.537ms     256.202us       6.784us        28.73%       7.904us       1.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        28.73%       6.784us       1.131us             6  
-                                Activity Buffer Request        75.51%       1.444ms        75.51%       1.444ms       1.444ms       1.120us         4.74%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.74%      33.230us         1.74%      33.230us       5.538us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.90%      55.533us         2.90%      55.533us       9.256us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.68%      32.211us         2.13%      40.791us       3.399us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.45%       8.580us         0.45%       8.580us       0.715us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.19%      41.853us         2.19%      41.853us       6.976us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.28%       5.420us         0.28%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     368.319us      1559.68%     368.319us     368.319us             1  
+                                      hf_kernels_rotary         8.92%     167.782us        99.73%       1.876ms       1.876ms       0.000us         0.00%      24.767us      24.767us             1  
+                          _rotary_dba7d1e::apply_rotary         2.34%      44.032us         4.50%      84.553us      14.092us      16.832us        71.28%      16.832us       2.805us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us        71.28%      16.832us       2.805us             6  
+                                            aten::clone         1.16%      21.840us        83.94%       1.579ms     263.113us       0.000us         0.00%       7.935us       1.322us             6  
+                                            aten::copy_         2.86%      53.852us        81.07%       1.525ms     254.111us       6.783us        28.72%       7.935us       1.322us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.783us        28.72%       6.783us       1.130us             6  
+                                Activity Buffer Request        75.10%       1.412ms        75.10%       1.412ms       1.412ms       1.152us         4.88%       1.152us       1.152us             1  
+                                    aten::empty_strided         1.71%      32.171us         1.71%      32.171us       5.362us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.11%      58.461us         3.11%      58.461us       9.744us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.82%      34.274us         2.37%      44.512us       3.709us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.54%      10.238us         0.54%      10.238us       0.853us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.15%      40.521us         2.15%      40.521us       6.753us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.140us         0.27%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.912ms
-Self CUDA time total: 23.616us
+Self CPU time total: 1.881ms
+Self CUDA time total: 23.615us
 
 
 
@@ -4041,23 +4041,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     340.570us      1359.24%     340.570us     340.570us             1  
-                                      hf_kernels_rotary         8.83%     169.069us        99.74%       1.910ms       1.910ms       0.000us         0.00%      26.368us      26.368us             1  
-                          _rotary_dba7d1e::apply_rotary         2.33%      44.610us         4.50%      86.120us      14.353us      17.248us        68.84%      17.248us       2.875us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.248us        68.84%      17.248us       2.875us             6  
-                                            aten::clone         1.25%      23.991us        84.27%       1.614ms     269.024us       0.000us         0.00%       9.120us       1.520us             6  
-                                            aten::copy_         1.92%      36.791us        81.38%       1.559ms     259.779us       7.808us        31.16%       9.120us       1.520us             6  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.939us      1384.60%     346.939us     346.939us             1  
+                                      hf_kernels_rotary         8.57%     160.653us        99.71%       1.870ms       1.870ms       0.000us         0.00%      26.369us      26.369us             1  
+                          _rotary_dba7d1e::apply_rotary         2.32%      43.421us         4.67%      87.601us      14.600us      17.249us        68.84%      17.249us       2.875us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.249us        68.84%      17.249us       2.875us             6  
+                                            aten::clone         1.23%      23.032us        84.13%       1.577ms     262.912us       0.000us         0.00%       9.120us       1.520us             6  
+                                            aten::copy_         1.94%      36.311us        81.17%       1.522ms     253.669us       7.808us        31.16%       9.120us       1.520us             6  
                          Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        31.16%       7.808us       1.301us             6  
-                                Activity Buffer Request        76.60%       1.467ms        76.60%       1.467ms       1.467ms       1.312us         5.24%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.64%      31.482us         1.64%      31.482us       5.247us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.85%      54.600us         2.85%      54.600us       9.100us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.69%      32.440us         2.15%      41.092us       3.424us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.45%       8.652us         0.45%       8.652us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.17%      41.510us         2.17%      41.510us       6.918us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.990us         0.26%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        76.42%       1.433ms        76.42%       1.433ms       1.433ms       1.312us         5.24%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.73%      32.420us         1.73%      32.420us       5.403us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.81%      52.730us         2.81%      52.730us       8.788us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.83%      34.233us         2.34%      43.964us       3.664us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.52%       9.731us         0.52%       9.731us       0.811us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.36%      44.180us         2.36%      44.180us       7.363us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.410us         0.29%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.915ms
-Self CUDA time total: 25.056us
+Self CPU time total: 1.875ms
+Self CUDA time total: 25.057us
 
 
 
@@ -4067,23 +4067,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.075us      1340.08%     346.075us     346.075us             1  
-                                      hf_kernels_rotary         7.97%     168.270us        99.76%       2.107ms       2.107ms       0.000us         0.00%      27.137us      27.137us             1  
-                          _rotary_dba7d1e::apply_rotary         2.16%      45.651us         4.14%      87.411us      14.569us      18.049us        69.89%      18.049us       3.008us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.049us        69.89%      18.049us       3.008us             6  
-                                            aten::clone         1.15%      24.271us        85.69%       1.810ms     301.630us       0.000us         0.00%       9.088us       1.515us             6  
-                                            aten::copy_         1.78%      37.581us        83.02%       1.753ms     292.225us       7.776us        30.11%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        30.11%       7.776us       1.296us             6  
-                                Activity Buffer Request        68.60%       1.449ms        68.60%       1.449ms       1.449ms       1.312us         5.08%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.52%      32.162us         1.52%      32.162us       5.360us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.64%     267.018us        12.64%     267.018us      44.503us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.55%      32.701us         1.96%      41.360us       3.447us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.41%       8.659us         0.41%       8.659us       0.722us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.98%      41.760us         1.98%      41.760us       6.960us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       5.141us         0.24%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.904us      1355.61%     347.904us     347.904us             1  
+                                      hf_kernels_rotary         7.92%     162.592us        99.76%       2.047ms       2.047ms       0.000us         0.00%      27.009us      27.009us             1  
+                          _rotary_dba7d1e::apply_rotary         2.09%      42.932us         4.15%      85.134us      14.189us      17.951us        69.95%      17.951us       2.992us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.951us        69.95%      17.951us       2.992us             6  
+                                            aten::clone         1.22%      25.009us        85.61%       1.757ms     292.750us       0.000us         0.00%       9.058us       1.510us             6  
+                                            aten::copy_         1.81%      37.091us        82.80%       1.699ms     283.112us       7.713us        30.05%       9.058us       1.510us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.713us        30.05%       7.713us       1.285us             6  
+                                Activity Buffer Request        69.84%       1.433ms        69.84%       1.433ms       1.433ms       1.345us         5.24%       1.345us       1.345us             1  
+                                    aten::empty_strided         1.60%      32.820us         1.60%      32.820us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.14%     228.627us        11.14%     228.627us      38.104us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.59%      32.701us         2.07%      42.551us       3.546us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.850us         0.48%       9.850us       0.821us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.06%      42.202us         2.06%      42.202us       7.034us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.861us         0.24%       4.861us       4.861us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.112ms
-Self CUDA time total: 25.825us
+Self CPU time total: 2.052ms
+Self CUDA time total: 25.664us
 
 
 
@@ -4093,23 +4093,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     383.355us      1524.21%     383.355us     383.355us             1  
-                                      hf_kernels_rotary         8.48%     177.428us        99.77%       2.088ms       2.088ms       0.000us         0.00%      26.495us      26.495us             1  
-                          _rotary_dba7d1e::apply_rotary         3.05%      63.861us         5.13%     107.442us      17.907us      17.215us        68.45%      17.215us       2.869us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.215us        68.45%      17.215us       2.869us             6  
-                                            aten::clone         1.13%      23.688us        84.02%       1.758ms     293.025us       0.000us         0.00%       9.280us       1.547us             6  
-                                            aten::copy_         1.90%      39.711us        81.30%       1.701ms     283.530us       7.936us        31.55%       9.280us       1.547us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        31.55%       7.936us       1.323us             6  
-                                Activity Buffer Request        67.53%       1.413ms        67.53%       1.413ms       1.413ms       1.344us         5.34%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.59%      33.283us         1.59%      33.283us       5.547us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.87%     248.348us        11.87%     248.348us      41.391us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.70%      35.532us         2.14%      44.714us       3.726us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       9.182us         0.44%       9.182us       0.765us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.08%      43.581us         2.08%      43.581us       7.264us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.831us         0.23%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     356.192us      1425.17%     356.192us     356.192us             1  
+                                      hf_kernels_rotary         9.03%     181.778us        99.74%       2.009ms       2.009ms       0.000us         0.00%      26.306us      26.306us             1  
+                          _rotary_dba7d1e::apply_rotary         2.18%      43.970us         4.25%      85.660us      14.277us      17.088us        68.37%      17.088us       2.848us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        68.37%      17.088us       2.848us             6  
+                                            aten::clone         1.16%      23.451us        84.31%       1.698ms     283.035us       0.000us         0.00%       9.218us       1.536us             6  
+                                            aten::copy_         1.79%      36.151us        81.55%       1.643ms     273.753us       7.905us        31.63%       9.218us       1.536us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.905us        31.63%       7.905us       1.318us             6  
+                                Activity Buffer Request        70.14%       1.413ms        70.14%       1.413ms       1.413ms       1.313us         5.25%       1.313us       1.313us             1  
+                                    aten::empty_strided         1.60%      32.242us         1.60%      32.242us       5.374us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.61%     193.593us         9.61%     193.593us      32.266us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.67%      33.621us         2.15%      43.371us       3.614us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.750us         0.48%       9.750us       0.812us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.07%      41.690us         2.07%      41.690us       6.948us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.140us         0.26%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.093ms
-Self CUDA time total: 25.151us
+Self CPU time total: 2.014ms
+Self CUDA time total: 24.993us
 
 
 
@@ -4119,23 +4119,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.288us      1348.70%     348.288us     348.288us             1  
-                                      hf_kernels_rotary         8.04%     167.026us        99.77%       2.072ms       2.072ms       0.000us         0.00%      27.136us      27.136us             1  
-                          _rotary_dba7d1e::apply_rotary         2.17%      45.031us         4.15%      86.212us      14.369us      18.016us        69.76%      18.016us       3.003us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.016us        69.76%      18.016us       3.003us             6  
-                                            aten::clone         1.23%      25.613us        85.56%       1.777ms     296.124us       0.000us         0.00%       9.120us       1.520us             6  
-                                            aten::copy_         1.80%      37.380us        82.71%       1.718ms     286.270us       7.808us        30.24%       9.120us       1.520us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        30.24%       7.808us       1.301us             6  
-                                Activity Buffer Request        69.08%       1.434ms        69.08%       1.434ms       1.434ms       1.312us         5.08%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.61%      33.511us         1.61%      33.511us       5.585us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.83%     245.758us        11.83%     245.758us      40.960us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.59%      33.022us         2.01%      41.843us       3.487us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.42%       8.821us         0.42%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.98%      41.181us         1.98%      41.181us       6.863us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.770us         0.23%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.469us      1341.21%     345.469us     345.469us             1  
+                                      hf_kernels_rotary         8.14%     161.605us        99.74%       1.979ms       1.979ms       0.000us         0.00%      27.070us      27.070us             1  
+                          _rotary_dba7d1e::apply_rotary         2.10%      41.690us         4.19%      83.112us      13.852us      17.982us        69.81%      17.982us       2.997us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.982us        69.81%      17.982us       2.997us             6  
+                                            aten::clone         1.15%      22.842us        85.12%       1.689ms     281.515us       0.000us         0.00%       9.088us       1.515us             6  
+                                            aten::copy_         1.84%      36.466us        82.36%       1.634ms     272.405us       7.776us        30.19%       9.088us       1.515us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        30.19%       7.776us       1.296us             6  
+                                Activity Buffer Request        71.40%       1.417ms        71.40%       1.417ms       1.417ms       1.312us         5.09%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.60%      31.821us         1.60%      31.821us       5.303us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.12%     181.057us         9.12%     181.057us      30.176us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.80%      35.740us         2.29%      45.520us       3.793us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%       9.780us         0.49%       9.780us       0.815us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.09%      41.422us         2.09%      41.422us       6.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.151us         0.26%       5.151us       5.151us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.077ms
-Self CUDA time total: 25.824us
+Self CPU time total: 1.984ms
+Self CUDA time total: 25.758us
 
 
 
@@ -4145,23 +4145,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.589us      1056.85%     342.589us     342.589us             1  
-                                      hf_kernels_rotary         8.06%     166.005us        99.77%       2.055ms       2.055ms       0.000us         0.00%      34.208us      34.208us             1  
-                          _rotary_dba7d1e::apply_rotary         2.10%      43.163us         4.03%      82.914us      13.819us      21.856us        67.42%      21.856us       3.643us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.856us        67.42%      21.856us       3.643us             6  
-                                            aten::clone         1.18%      24.311us        85.73%       1.766ms     294.310us       0.000us         0.00%      12.352us       2.059us             6  
-                                            aten::copy_         1.85%      38.151us        82.92%       1.708ms     284.677us      10.560us        32.58%      12.352us       2.059us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        32.58%      10.560us       1.760us             6  
-                                Activity Buffer Request        69.37%       1.429ms        69.37%       1.429ms       1.429ms       1.792us         5.53%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.63%      33.490us         1.63%      33.490us       5.582us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.70%     241.040us        11.70%     241.040us      40.173us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.54%      31.672us         1.96%      40.421us       3.368us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.42%       8.749us         0.42%       8.749us       0.729us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.93%      39.751us         1.93%      39.751us       6.625us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.681us         0.23%       4.681us       4.681us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     370.847us      1148.52%     370.847us     370.847us             1  
+                                      hf_kernels_rotary         8.48%     171.185us        99.77%       2.015ms       2.015ms       0.000us         0.00%      34.081us      34.081us             1  
+                          _rotary_dba7d1e::apply_rotary         2.32%      46.763us         4.49%      90.723us      15.120us      21.793us        67.49%      21.793us       3.632us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.793us        67.49%      21.793us       3.632us             6  
+                                            aten::clone         1.25%      25.309us        84.59%       1.708ms     284.718us       0.000us         0.00%      12.288us       2.048us             6  
+                                            aten::copy_         1.96%      39.631us        81.62%       1.648ms     274.723us      10.496us        32.51%      12.288us       2.048us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us        32.51%      10.496us       1.749us             6  
+                                Activity Buffer Request        70.18%       1.417ms        70.18%       1.417ms       1.417ms       1.792us         5.55%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.72%      34.661us         1.72%      34.661us       5.777us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.48%     191.424us         9.48%     191.424us      31.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.73%      34.932us         2.22%      44.771us       3.731us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%       9.839us         0.49%       9.839us       0.820us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.18%      43.960us         2.18%      43.960us       7.327us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.601us         0.23%       4.601us       4.601us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.060ms
-Self CUDA time total: 32.416us
+Self CPU time total: 2.020ms
+Self CUDA time total: 32.289us
 
 
 
@@ -4171,23 +4171,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.021us       674.53%     349.021us     349.021us             1  
-                                      hf_kernels_rotary         8.13%     167.188us        99.77%       2.053ms       2.053ms       0.000us         0.00%      54.656us      54.656us             1  
-                          _rotary_dba7d1e::apply_rotary         2.05%      42.101us         4.09%      84.171us      14.029us      34.590us        66.85%      34.590us       5.765us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.590us        66.85%      34.590us       5.765us             6  
-                                            aten::clone         1.20%      24.743us        85.45%       1.758ms     292.975us       0.000us         0.00%      20.066us       3.344us             6  
-                                            aten::copy_         1.77%      36.360us        82.61%       1.700ms     283.256us      17.153us        33.15%      20.066us       3.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.153us        33.15%      17.153us       2.859us             6  
-                                Activity Buffer Request        69.27%       1.425ms        69.27%       1.425ms       1.425ms       2.913us         5.63%       2.913us       2.913us             1  
-                                    aten::empty_strided         1.63%      33.571us         1.63%      33.571us       5.595us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.58%     238.157us        11.58%     238.157us      39.693us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.68%      34.499us         2.11%      43.362us       3.614us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.43%       8.863us         0.43%       8.863us       0.739us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.04%      42.070us         2.04%      42.070us       7.012us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.701us         0.23%       4.701us       4.701us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.533us       668.21%     345.533us     345.533us             1  
+                                      hf_kernels_rotary         8.13%     161.677us        99.76%       1.983ms       1.983ms       0.000us         0.00%      54.558us      54.558us             1  
+                          _rotary_dba7d1e::apply_rotary         2.15%      42.810us         4.29%      85.240us      14.207us      34.782us        67.26%      34.782us       5.797us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.782us        67.26%      34.782us       5.797us             6  
+                                            aten::clone         1.16%      23.089us        85.02%       1.690ms     281.665us       0.000us         0.00%      19.776us       3.296us             6  
+                                            aten::copy_         1.78%      35.482us        82.32%       1.636ms     272.722us      16.928us        32.74%      19.776us       3.296us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us        32.74%      16.928us       2.821us             6  
+                                Activity Buffer Request        71.53%       1.422ms        71.53%       1.422ms       1.422ms       2.848us         5.51%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.54%      30.571us         1.54%      30.571us       5.095us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.00%     178.904us         9.00%     178.904us      29.817us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.84%      36.581us         2.32%      46.051us       3.838us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.470us         0.48%       9.470us       0.789us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.13%      42.430us         2.13%      42.430us       7.072us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.870us         0.24%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.057ms
-Self CUDA time total: 51.743us
+Self CPU time total: 1.988ms
+Self CUDA time total: 51.710us
 
 
 
@@ -4197,23 +4197,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.845us      1058.69%     342.845us     342.845us             1  
-                                      hf_kernels_rotary         7.95%     162.638us        99.78%       2.041ms       2.041ms       0.000us         0.00%      34.176us      34.176us             1  
-                          _rotary_dba7d1e::apply_rotary         2.08%      42.501us         4.07%      83.221us      13.870us      21.760us        67.19%      21.760us       3.627us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.760us        67.19%      21.760us       3.627us             6  
-                                            aten::clone         1.16%      23.762us        85.72%       1.754ms     292.258us       0.000us         0.00%      12.416us       2.069us             6  
-                                            aten::copy_         1.82%      37.190us        83.02%       1.698ms     283.036us      10.624us        32.81%      12.416us       2.069us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.624us        32.81%      10.624us       1.771us             6  
-                                Activity Buffer Request        69.60%       1.424ms        69.60%       1.424ms       1.424ms       1.792us         5.53%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.54%      31.570us         1.54%      31.570us       5.262us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.60%     237.247us        11.60%     237.247us      39.541us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.62%      33.195us         2.03%      41.584us       3.465us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.41%       8.389us         0.41%       8.389us       0.699us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.99%      40.720us         1.99%      40.720us       6.787us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       4.600us         0.22%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.136us      1047.28%     338.136us     338.136us             1  
+                                      hf_kernels_rotary        19.11%     157.801us        99.43%     820.869us     820.869us       0.000us         0.00%      34.078us      34.078us             1  
+                          _rotary_dba7d1e::apply_rotary         5.12%      42.269us        10.18%      84.080us      14.013us      21.792us        67.49%      21.792us       3.632us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.792us        67.49%      21.792us       3.632us             6  
+                                            aten::clone         2.56%      21.133us        65.13%     537.684us      89.614us       0.000us         0.00%      12.286us       2.048us             6  
+                                            aten::copy_         4.56%      37.650us        58.77%     485.172us      80.862us      10.495us        32.51%      12.286us       2.048us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us        32.51%      10.495us       1.749us             6  
+                                Activity Buffer Request        32.51%     268.347us        32.51%     268.347us     268.347us       1.791us         5.55%       1.791us       1.791us             1  
+                                    aten::empty_strided         3.80%      31.379us         3.80%      31.379us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.70%     179.175us        21.70%     179.175us      29.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.93%      32.405us         5.00%      41.304us       3.442us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.08%       8.899us         1.08%       8.899us       0.742us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.06%      41.811us         5.06%      41.811us       6.969us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.680us         0.57%       4.680us       4.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.046ms
-Self CUDA time total: 32.384us
+Self CPU time total: 825.549us
+Self CUDA time total: 32.287us
 
 
 
@@ -4223,23 +4223,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.276us       667.68%     345.276us     345.276us             1  
-                                      hf_kernels_rotary        17.87%     159.778us        99.47%     889.262us     889.262us       0.000us         0.00%      54.593us      54.593us             1  
-                          _rotary_dba7d1e::apply_rotary         4.83%      43.201us         9.55%      85.402us      14.234us      34.656us        67.02%      34.656us       5.776us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.656us        67.02%      34.656us       5.776us             6  
-                                            aten::clone         2.69%      24.052us        67.57%     604.071us     100.678us       0.000us         0.00%      19.937us       3.323us             6  
-                                            aten::copy_         3.98%      35.591us        61.32%     548.169us      91.362us      17.057us        32.98%      19.937us       3.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.057us        32.98%      17.057us       2.843us             6  
-                                Activity Buffer Request        31.28%     279.600us        31.28%     279.600us     279.600us       2.880us         5.57%       2.880us       2.880us             1  
-                                    aten::empty_strided         3.56%      31.850us         3.56%      31.850us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.06%     232.978us        26.06%     232.978us      38.830us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.51%      31.369us         4.48%      40.011us       3.334us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.97%       8.642us         0.97%       8.642us       0.720us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.72%      42.201us         4.72%      42.201us       7.034us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.53%       4.740us         0.53%       4.740us       4.740us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.832us       672.66%     347.832us     347.832us             1  
+                                      hf_kernels_rotary        18.98%     156.996us        99.42%     822.501us     822.501us       0.000us         0.00%      54.558us      54.558us             1  
+                          _rotary_dba7d1e::apply_rotary         5.15%      42.621us        10.22%      84.512us      14.085us      34.783us        67.27%      34.783us       5.797us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.783us        67.27%      34.783us       5.797us             6  
+                                            aten::clone         2.65%      21.930us        64.92%     537.102us      89.517us       0.000us         0.00%      19.775us       3.296us             6  
+                                            aten::copy_         4.53%      37.450us        58.33%     482.542us      80.424us      16.927us        32.73%      19.775us       3.296us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.927us        32.73%      16.927us       2.821us             6  
+                                Activity Buffer Request        32.06%     265.247us        32.06%     265.247us     265.247us       2.848us         5.51%       2.848us       2.848us             1  
+                                    aten::empty_strided         3.94%      32.630us         3.94%      32.630us       5.438us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.74%     179.845us        21.74%     179.845us      29.974us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.14%      34.239us         5.31%      43.891us       3.658us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.17%       9.652us         1.17%       9.652us       0.804us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.06%      41.891us         5.06%      41.891us       6.982us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.770us         0.58%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 894.002us
-Self CUDA time total: 51.713us
+Self CPU time total: 827.271us
+Self CUDA time total: 51.710us
 
 
 
@@ -4249,23 +4249,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     372.345us       343.04%     372.345us     372.345us             1  
-                                      hf_kernels_rotary        19.45%     178.278us        99.48%     911.643us     911.643us       0.000us         0.00%     126.592us     126.592us             1  
-                                            aten::clone         2.39%      21.900us        65.33%     598.671us      99.778us       0.000us         0.00%      69.792us      11.632us             6  
-                                            aten::copy_         4.20%      38.503us        59.48%     545.071us      90.845us      51.744us        47.67%      69.792us      11.632us             6  
-                          _rotary_dba7d1e::apply_rotary         5.03%      46.070us         9.81%      89.853us      14.975us      56.800us        52.33%      56.800us       9.467us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      56.800us        52.33%      56.800us       9.467us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.744us        47.67%      51.744us       8.624us             6  
-                                Activity Buffer Request        29.76%     272.689us        29.76%     272.689us     272.689us      18.048us        16.63%      18.048us      18.048us             1  
-                                    aten::empty_strided         3.46%      31.700us         3.46%      31.700us       5.283us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.52%     233.879us        25.52%     233.879us      38.980us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.90%      35.730us         4.89%      44.841us       3.737us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.99%       9.111us         0.99%       9.111us       0.759us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.78%      43.783us         4.78%      43.783us       7.297us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.52%       4.730us         0.52%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.413us       323.34%     352.413us     352.413us             1  
+                                      hf_kernels_rotary        18.38%     152.793us        99.44%     826.801us     826.801us       0.000us         0.00%     127.423us     127.423us             1  
+                                            aten::clone         2.64%      21.959us        64.91%     539.754us      89.959us       0.000us         0.00%      69.984us      11.664us             6  
+                                            aten::copy_         4.48%      37.251us        58.50%     486.434us      81.072us      51.552us        47.30%      69.984us      11.664us             6  
+                          _rotary_dba7d1e::apply_rotary         5.35%      44.522us        10.55%      87.704us      14.617us      57.439us        52.70%      57.439us       9.573us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      57.439us        52.70%      57.439us       9.573us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.552us        47.30%      51.552us       8.592us             6  
+                                Activity Buffer Request        32.52%     270.437us        32.52%     270.437us     270.437us      18.432us        16.91%      18.432us      18.432us             1  
+                                    aten::empty_strided         3.77%      31.361us         3.77%      31.361us       5.227us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.50%     178.746us        21.50%     178.746us      29.791us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.45%      36.960us         5.60%      46.550us       3.879us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.15%       9.590us         1.15%       9.590us       0.799us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.19%      43.182us         5.19%      43.182us       7.197us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.56%       4.690us         0.56%       4.690us       4.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 916.373us
-Self CUDA time total: 108.544us
+Self CPU time total: 831.491us
+Self CUDA time total: 108.991us
 
 
 
@@ -4275,23 +4275,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     373.881us       208.27%     373.881us     373.881us             1  
-                                      hf_kernels_rotary        17.56%     156.837us        99.52%     888.752us     888.752us       0.000us         0.00%     203.231us     203.231us             1  
-                                            aten::clone         2.51%      22.450us        65.45%     584.500us      97.417us       0.000us         0.00%     102.431us      17.072us             6  
-                                            aten::copy_         4.24%      37.839us        59.27%     529.299us      88.217us      78.719us        43.85%     102.431us      17.072us             6  
-                          _rotary_dba7d1e::apply_rotary         4.89%      43.682us        11.68%     104.316us      17.386us     100.800us        56.15%     100.800us      16.800us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     100.800us        56.15%     100.800us      16.800us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.719us        43.85%      78.719us      13.120us             6  
-                                Activity Buffer Request        29.56%     264.020us        29.56%     264.020us     264.020us      23.712us        13.21%      23.712us      23.712us             1  
-                                    aten::empty_strided         3.67%      32.751us         3.67%      32.751us       5.458us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.47%     227.440us        25.47%     227.440us      37.907us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.79%      33.838us         4.83%      43.099us       3.592us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.04%       9.261us         1.04%       9.261us       0.772us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         6.79%      60.634us         6.79%      60.634us      10.106us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.48%       4.320us         0.48%       4.320us       4.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     354.429us       196.77%     354.429us     354.429us             1  
+                                      hf_kernels_rotary        18.96%     156.272us        99.48%     819.980us     819.980us       0.000us         0.00%     203.900us     203.900us             1  
+                                            aten::clone         2.73%      22.479us        64.84%     534.473us      89.079us       0.000us         0.00%     102.557us      17.093us             6  
+                                            aten::copy_         4.31%      35.551us        58.35%     480.933us      80.156us      78.782us        43.74%     102.557us      17.093us             6  
+                          _rotary_dba7d1e::apply_rotary         5.14%      42.393us        10.35%      85.274us      14.212us     101.343us        56.26%     101.343us      16.890us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     101.343us        56.26%     101.343us      16.890us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.782us        43.74%      78.782us      13.130us             6  
+                                Activity Buffer Request        32.52%     268.027us        32.52%     268.027us     268.027us      23.775us        13.20%      23.775us      23.775us             1  
+                                    aten::empty_strided         3.77%      31.061us         3.77%      31.061us       5.177us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.52%     177.355us        21.52%     177.355us      29.559us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.12%      33.982us         5.33%      43.961us       3.663us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.21%       9.979us         1.21%       9.979us       0.832us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.20%      42.881us         5.20%      42.881us       7.147us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.52%       4.300us         0.52%       4.300us       4.300us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 893.072us
-Self CUDA time total: 179.519us
+Self CPU time total: 824.280us
+Self CUDA time total: 180.125us
 
 
 
@@ -4301,23 +4301,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.068us      1293.81%     339.068us     339.068us             1  
-                                      hf_kernels_rotary        18.21%     158.266us        99.46%     864.691us     864.691us       0.000us         0.00%      27.359us      27.359us             1  
-                          _rotary_dba7d1e::apply_rotary         4.98%      43.284us         9.71%      84.425us      14.071us      19.391us        73.99%      19.391us       3.232us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.391us        73.99%      19.391us       3.232us             6  
-                                            aten::clone         2.67%      23.179us        66.79%     580.620us      96.770us       0.000us         0.00%       7.968us       1.328us             6  
-                                            aten::copy_         4.38%      38.042us        60.58%     526.630us      87.772us       6.816us        26.01%       7.968us       1.328us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        26.01%       6.816us       1.136us             6  
-                                Activity Buffer Request        29.98%     260.620us        29.98%     260.620us     260.620us       1.152us         4.40%       1.152us       1.152us             1  
-                                    aten::empty_strided         3.54%      30.811us         3.54%      30.811us       5.135us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.22%     227.968us        26.22%     227.968us      37.995us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.77%      32.731us         4.76%      41.380us       3.448us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.99%       8.649us         0.99%       8.649us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.73%      41.141us         4.73%      41.141us       6.857us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.54%       4.651us         0.54%       4.651us       4.651us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.587us      1293.50%     338.587us     338.587us             1  
+                                      hf_kernels_rotary        19.34%     157.366us        99.42%     808.960us     808.960us       0.000us         0.00%      27.296us      27.296us             1  
+                          _rotary_dba7d1e::apply_rotary         5.26%      42.761us        10.55%      85.842us      14.307us      19.392us        74.08%      19.392us       3.232us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.392us        74.08%      19.392us       3.232us             6  
+                                            aten::clone         2.60%      21.121us        64.41%     524.052us      87.342us       0.000us         0.00%       7.904us       1.317us             6  
+                                            aten::copy_         4.60%      37.442us        58.06%     472.441us      78.740us       6.784us        25.92%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        25.92%       6.784us       1.131us             6  
+                                Activity Buffer Request        31.61%     257.196us        31.61%     257.196us     257.196us       1.120us         4.28%       1.120us       1.120us             1  
+                                    aten::empty_strided         3.75%      30.490us         3.75%      30.490us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.85%     177.803us        21.85%     177.803us      29.634us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.95%      32.140us         5.12%      41.700us       3.475us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.17%       9.560us         1.17%       9.560us       0.797us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.29%      43.081us         5.29%      43.081us       7.180us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.711us         0.58%       4.711us       4.711us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 869.342us
-Self CUDA time total: 26.207us
+Self CPU time total: 813.671us
+Self CUDA time total: 26.176us
 
 
 
@@ -4327,23 +4327,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.689us      1259.02%     345.689us     345.689us             1  
-                                      hf_kernels_rotary        18.17%     159.455us        99.46%     872.870us     872.870us       0.000us         0.00%      28.769us      28.769us             1  
-                          _rotary_dba7d1e::apply_rotary         4.92%      43.180us         9.80%      85.973us      14.329us      19.616us        71.44%      19.616us       3.269us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.616us        71.44%      19.616us       3.269us             6  
-                                            aten::clone         2.64%      23.140us        66.83%     586.460us      97.743us       0.000us         0.00%       9.153us       1.526us             6  
-                                            aten::copy_         4.27%      37.430us        60.39%     529.960us      88.327us       7.841us        28.56%       9.153us       1.526us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us        28.56%       7.841us       1.307us             6  
-                                Activity Buffer Request        29.89%     262.350us        29.89%     262.350us     262.350us       1.312us         4.78%       1.312us       1.312us             1  
-                                    aten::empty_strided         3.80%      33.360us         3.80%      33.360us       5.560us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.23%     230.180us        26.23%     230.180us      38.363us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.66%      32.161us         4.67%      40.982us       3.415us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.01%       8.821us         1.01%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.88%      42.793us         4.88%      42.793us       7.132us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.54%       4.730us         0.54%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.862us      1278.50%     349.862us     349.862us             1  
+                                      hf_kernels_rotary        19.32%     156.134us        99.42%     803.460us     803.460us       0.000us         0.00%      28.709us      28.709us             1  
+                          _rotary_dba7d1e::apply_rotary         5.33%      43.099us        10.84%      87.643us      14.607us      19.428us        71.00%      19.428us       3.238us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.428us        71.00%      19.428us       3.238us             6  
+                                            aten::clone         2.80%      22.600us        63.71%     514.893us      85.816us       0.000us         0.00%       9.281us       1.547us             6  
+                                            aten::copy_         4.89%      39.481us        56.99%     460.582us      76.764us       7.937us        29.00%       9.281us       1.547us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.937us        29.00%       7.937us       1.323us             6  
+                                Activity Buffer Request        27.85%     225.076us        27.85%     225.076us     225.076us       1.344us         4.91%       1.344us       1.344us             1  
+                                    aten::empty_strided         3.92%      31.711us         3.92%      31.711us       5.285us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        24.26%     196.025us        24.26%     196.025us      32.671us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.38%      35.400us         5.54%      44.790us       3.732us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.16%       9.390us         1.16%       9.390us       0.782us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.51%      44.544us         5.51%      44.544us       7.424us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.720us         0.58%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 877.600us
-Self CUDA time total: 27.457us
+Self CPU time total: 808.180us
+Self CUDA time total: 27.365us
 
 
 
@@ -4353,23 +4353,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.280us      1238.42%     352.280us     352.280us             1  
-                                      hf_kernels_rotary        18.63%     163.526us        99.48%     873.041us     873.041us       0.000us         0.00%      29.790us      29.790us             1  
-                          _rotary_dba7d1e::apply_rotary         4.98%      43.742us         9.85%      86.414us      14.402us      20.606us        72.44%      20.606us       3.434us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.606us        72.44%      20.606us       3.434us             6  
-                                            aten::clone         2.59%      22.720us        66.23%     581.279us      96.880us       0.000us         0.00%       9.184us       1.531us             6  
-                                            aten::copy_         4.14%      36.351us        59.98%     526.379us      87.730us       7.840us        27.56%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        27.56%       7.840us       1.307us             6  
-                                Activity Buffer Request        30.03%     263.549us        30.03%     263.549us     263.549us       1.344us         4.72%       1.344us       1.344us             1  
-                                    aten::empty_strided         3.67%      32.180us         3.67%      32.180us       5.363us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.81%     226.479us        25.81%     226.479us      37.747us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.76%      33.033us         4.77%      41.822us       3.485us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.00%       8.789us         1.00%       8.789us       0.732us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.86%      42.672us         4.86%      42.672us       7.112us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.52%       4.560us         0.52%       4.560us       4.560us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.981us      1235.85%     349.981us     349.981us             1  
+                                      hf_kernels_rotary         8.03%     161.215us        99.76%       2.003ms       2.003ms       0.000us         0.00%      29.663us      29.663us             1  
+                          _rotary_dba7d1e::apply_rotary         2.11%      42.422us         4.23%      84.982us      14.164us      20.544us        72.54%      20.544us       3.424us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.544us        72.54%      20.544us       3.424us             6  
+                                            aten::clone         1.12%      22.572us        85.29%       1.712ms     285.349us       0.000us         0.00%       9.119us       1.520us             6  
+                                            aten::copy_         1.91%      38.260us        82.54%       1.657ms     276.143us       7.775us        27.46%       9.119us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us        27.46%       7.775us       1.296us             6  
+                                Activity Buffer Request        71.67%       1.439ms        71.67%       1.439ms       1.439ms       1.344us         4.75%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.63%      32.660us         1.63%      32.660us       5.443us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.96%     179.936us         8.96%     179.936us      29.989us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.74%      34.910us         2.20%      44.250us       3.688us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.47%       9.340us         0.47%       9.340us       0.778us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.12%      42.560us         2.12%      42.560us       7.093us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.741us         0.24%       4.741us       4.741us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 877.601us
-Self CUDA time total: 28.446us
+Self CPU time total: 2.007ms
+Self CUDA time total: 28.319us
 
 
 
@@ -4379,23 +4379,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     341.881us       953.86%     341.881us     341.881us             1  
-                                      hf_kernels_rotary        17.61%     155.956us        99.45%     880.921us     880.921us       0.000us         0.00%      37.634us      37.634us             1  
-                          _rotary_dba7d1e::apply_rotary         4.86%      43.060us         9.73%      86.184us      14.364us      25.312us        70.62%      25.312us       4.219us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.312us        70.62%      25.312us       4.219us             6  
-                                            aten::clone         2.52%      22.319us        67.43%     597.290us      99.548us       0.000us         0.00%      12.322us       2.054us             6  
-                                            aten::copy_         4.12%      36.502us        61.34%     543.331us      90.555us      10.530us        29.38%      12.322us       2.054us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.530us        29.38%      10.530us       1.755us             6  
-                                Activity Buffer Request        31.67%     280.550us        31.67%     280.550us     280.550us       1.792us         5.00%       1.792us       1.792us             1  
-                                    aten::empty_strided         3.57%      31.640us         3.57%      31.640us       5.273us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.54%     226.279us        25.54%     226.279us      37.713us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.70%      32.812us         4.68%      41.491us       3.458us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.98%       8.679us         0.98%       8.679us       0.723us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.87%      43.124us         4.87%      43.124us       7.187us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.55%       4.910us         0.55%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.238us       971.27%     346.238us     346.238us             1  
+                                      hf_kernels_rotary         8.04%     160.124us        99.76%       1.988ms       1.988ms       0.000us         0.00%      37.440us      37.440us             1  
+                          _rotary_dba7d1e::apply_rotary         2.20%      43.921us         4.24%      84.493us      14.082us      25.216us        70.74%      25.216us       4.203us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.216us        70.74%      25.216us       4.203us             6  
+                                            aten::clone         1.14%      22.762us        85.30%       1.700ms     283.325us       0.000us         0.00%      12.224us       2.037us             6  
+                                            aten::copy_         1.84%      36.620us        82.53%       1.645ms     274.105us      10.432us        29.26%      12.224us       2.037us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        29.26%      10.432us       1.739us             6  
+                                Activity Buffer Request        71.70%       1.429ms        71.70%       1.429ms       1.429ms       1.792us         5.03%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.63%      32.561us         1.63%      32.561us       5.427us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.99%     179.114us         8.99%     179.114us      29.852us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.72%      34.250us         2.18%      43.390us       3.616us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.46%       9.140us         0.46%       9.140us       0.762us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.04%      40.572us         2.04%      40.572us       6.762us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.860us         0.24%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 885.831us
-Self CUDA time total: 35.842us
+Self CPU time total: 1.993ms
+Self CUDA time total: 35.648us
 
 
 
@@ -4405,23 +4405,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.158us      1221.01%     348.158us     348.158us             1  
-                                      hf_kernels_rotary         7.73%     158.832us        99.76%       2.051ms       2.051ms       0.000us         0.00%      29.858us      29.858us             1  
-                          _rotary_dba7d1e::apply_rotary         2.18%      44.723us         4.13%      84.825us      14.138us      20.674us        72.50%      20.674us       3.446us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.674us        72.50%      20.674us       3.446us             6  
-                                            aten::clone         1.24%      25.490us        85.81%       1.764ms     294.032us       0.000us         0.00%       9.184us       1.531us             6  
-                                            aten::copy_         1.80%      37.082us        83.01%       1.707ms     284.462us       7.840us        27.50%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        27.50%       7.840us       1.307us             6  
-                                Activity Buffer Request        70.14%       1.442ms        70.14%       1.442ms       1.442ms       1.344us         4.71%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.55%      31.931us         1.55%      31.931us       5.322us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.07%     227.598us        11.07%     227.598us      37.933us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.67%      34.312us         2.11%      43.312us       3.609us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       9.000us         0.44%       9.000us       0.750us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.95%      40.102us         1.95%      40.102us       6.684us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.880us         0.24%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.675us      1229.10%     347.675us     347.675us             1  
+                                      hf_kernels_rotary         8.06%     160.274us        99.76%       1.984ms       1.984ms       0.000us         0.00%      29.631us      29.631us             1  
+                          _rotary_dba7d1e::apply_rotary         2.18%      43.331us         4.28%      85.164us      14.194us      20.511us        72.51%      20.511us       3.418us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.511us        72.51%      20.511us       3.418us             6  
+                                            aten::clone         1.13%      22.531us        85.26%       1.696ms     282.610us       0.000us         0.00%       9.120us       1.520us             6  
+                                            aten::copy_         1.97%      39.252us        82.52%       1.641ms     273.528us       7.776us        27.49%       9.120us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        27.49%       7.776us       1.296us             6  
+                                Activity Buffer Request        71.58%       1.424ms        71.58%       1.424ms       1.424ms       1.344us         4.75%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.61%      31.959us         1.61%      31.959us       5.326us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.97%     178.354us         8.97%     178.354us      29.726us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.68%      33.430us         2.16%      42.920us       3.577us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.490us         0.48%       9.490us       0.791us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.10%      41.833us         2.10%      41.833us       6.972us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.801us         0.24%       4.801us       4.801us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.056ms
-Self CUDA time total: 28.514us
+Self CPU time total: 1.989ms
+Self CUDA time total: 28.287us
 
 
 
@@ -4431,23 +4431,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.320us       959.86%     344.320us     344.320us             1  
-                                      hf_kernels_rotary        18.29%     156.315us        99.44%     849.960us     849.960us       0.000us         0.00%      37.664us      37.664us             1  
-                          _rotary_dba7d1e::apply_rotary         5.15%      43.990us        10.72%      91.654us      15.276us      25.312us        70.56%      25.312us       4.219us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.312us        70.56%      25.312us       4.219us             6  
-                                            aten::clone         2.62%      22.368us        65.70%     561.560us      93.593us       0.000us         0.00%      12.352us       2.059us             6  
-                                            aten::copy_         4.13%      35.283us        59.24%     506.308us      84.385us      10.560us        29.44%      12.352us       2.059us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        29.44%      10.560us       1.760us             6  
-                                Activity Buffer Request        29.39%     251.239us        29.39%     251.239us     251.239us       1.792us         5.00%       1.792us       1.792us             1  
-                                    aten::empty_strided         3.85%      32.884us         3.85%      32.884us       5.481us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.71%     219.786us        25.71%     219.786us      36.631us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.67%      31.402us         4.73%      40.431us       3.369us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.06%       9.029us         1.06%       9.029us       0.752us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.58%      47.664us         5.58%      47.664us       7.944us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.56%       4.781us         0.56%       4.781us       4.781us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     341.434us       959.52%     341.434us     341.434us             1  
+                                      hf_kernels_rotary        20.68%     156.375us        99.37%     751.248us     751.248us       0.000us         0.00%      37.312us      37.312us             1  
+                          _rotary_dba7d1e::apply_rotary         5.66%      42.780us        11.14%      84.232us      14.039us      25.184us        70.77%      25.184us       4.197us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.184us        70.77%      25.184us       4.197us             6  
+                                            aten::clone         3.01%      22.779us        61.92%     468.081us      78.014us       0.000us         0.00%      12.128us       2.021us             6  
+                                            aten::copy_         4.78%      36.161us        54.65%     413.150us      68.858us      10.400us        29.23%      12.128us       2.021us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        29.23%      10.400us       1.733us             6  
+                                Activity Buffer Request        26.22%     198.225us        26.22%     198.225us     198.225us       1.728us         4.86%       1.728us       1.728us             1  
+                                    aten::empty_strided         4.25%      32.152us         4.25%      32.152us       5.359us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.65%     178.764us        23.65%     178.764us      29.794us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.40%      33.290us         5.63%      42.560us       3.547us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.23%       9.270us         1.23%       9.270us       0.773us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.48%      41.452us         5.48%      41.452us       6.909us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.63%       4.741us         0.63%       4.741us       4.741us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 854.741us
-Self CUDA time total: 35.872us
+Self CPU time total: 755.989us
+Self CUDA time total: 35.584us
 
 
 
@@ -4457,23 +4457,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.158us       593.10%     335.158us     335.158us             1  
-                                      hf_kernels_rotary        18.22%     154.324us        99.44%     842.379us     842.379us       0.000us         0.00%      59.390us      59.390us             1  
-                          _rotary_dba7d1e::apply_rotary         4.99%      42.273us         9.84%      83.374us      13.896us      39.454us        69.82%      39.454us       6.576us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.454us        69.82%      39.454us       6.576us             6  
-                                            aten::clone         2.56%      21.663us        66.58%     564.010us      94.002us       0.000us         0.00%      19.936us       3.323us             6  
-                                            aten::copy_         4.16%      35.260us        60.33%     511.017us      85.169us      17.056us        30.18%      19.936us       3.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        30.18%      17.056us       2.843us             6  
-                                Activity Buffer Request        30.26%     256.319us        30.26%     256.319us     256.319us       2.880us         5.10%       2.880us       2.880us             1  
-                                    aten::empty_strided         3.70%      31.330us         3.70%      31.330us       5.222us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.90%     219.438us        25.90%     219.438us      36.573us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.75%      31.762us         4.80%      40.671us       3.389us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.05%       8.909us         1.05%       8.909us       0.742us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.85%      41.101us         4.85%      41.101us       6.850us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.56%       4.710us         0.56%       4.710us       4.710us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.886us       617.06%     349.886us     349.886us             1  
+                                      hf_kernels_rotary        15.93%     158.238us        99.46%     988.285us     988.285us       0.000us         0.00%      59.582us      59.582us             1  
+                          _rotary_dba7d1e::apply_rotary         4.43%      44.009us         8.77%      87.171us      14.528us      39.742us        70.09%      39.742us       6.624us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.742us        70.09%      39.742us       6.624us             6  
+                                            aten::clone         2.20%      21.907us        70.33%     698.845us     116.474us       0.000us         0.00%      19.840us       3.307us             6  
+                                            aten::copy_         3.76%      37.392us        65.02%     646.067us     107.678us      16.960us        29.91%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us        29.91%      16.960us       2.827us             6  
+                                Activity Buffer Request        43.30%     430.221us        43.30%     430.221us     430.221us       2.880us         5.08%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.11%      30.871us         3.11%      30.871us       5.145us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.96%     178.454us        17.96%     178.454us      29.742us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.43%      34.051us         4.43%      44.031us       3.669us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.00%       9.980us         1.00%       9.980us       0.832us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.34%      43.162us         4.34%      43.162us       7.194us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.54%       5.320us         0.54%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 847.089us
-Self CUDA time total: 56.510us
+Self CPU time total: 993.605us
+Self CUDA time total: 56.702us
 
 
 
@@ -4483,23 +4483,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     369.080us       312.82%     369.080us     369.080us             1  
-                                      hf_kernels_rotary        20.18%     177.506us        99.45%     874.621us     874.621us       0.000us         0.00%     134.912us     134.912us             1  
-                                            aten::clone         2.49%      21.878us        64.31%     565.600us      94.267us       0.000us         0.00%      69.696us      11.616us             6  
-                                            aten::copy_         4.23%      37.163us        58.33%     512.969us      85.495us      52.768us        44.72%      69.696us      11.616us             6  
-                          _rotary_dba7d1e::apply_rotary         5.24%      46.042us        10.09%      88.704us      14.784us      65.216us        55.28%      65.216us      10.869us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.216us        55.28%      65.216us      10.869us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.768us        44.72%      52.768us       8.795us             6  
-                                Activity Buffer Request        28.97%     254.819us        28.97%     254.819us     254.819us      16.928us        14.35%      16.928us      16.928us             1  
-                                    aten::empty_strided         3.50%      30.753us         3.50%      30.753us       5.126us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.13%     220.987us        25.13%     220.987us      36.831us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.86%      33.990us         4.87%      42.811us       3.568us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.00%       8.821us         1.00%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.85%      42.662us         4.85%      42.662us       7.110us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.55%       4.870us         0.55%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.574us       297.38%     352.574us     352.574us             1  
+                                      hf_kernels_rotary        18.56%     157.003us        99.43%     841.041us     841.041us       0.000us         0.00%     135.680us     135.680us             1  
+                                            aten::clone         2.59%      21.881us        65.75%     556.174us      92.696us       0.000us         0.00%      69.984us      11.664us             6  
+                                            aten::copy_         4.37%      36.992us        59.34%     501.912us      83.652us      52.864us        44.59%      69.984us      11.664us             6  
+                          _rotary_dba7d1e::apply_rotary         5.11%      43.221us        10.14%      85.754us      14.292us      65.696us        55.41%      65.696us      10.949us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.696us        55.41%      65.696us      10.949us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.864us        44.59%      52.864us       8.811us             6  
+                                Activity Buffer Request        33.65%     284.597us        33.65%     284.597us     284.597us      17.120us        14.44%      17.120us      17.120us             1  
+                                    aten::empty_strided         3.83%      32.381us         3.83%      32.381us       5.397us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.32%     180.323us        21.32%     180.323us      30.054us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.89%      32.880us         4.98%      42.110us       3.509us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.09%       9.230us         1.09%       9.230us       0.769us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.03%      42.533us         5.03%      42.533us       7.089us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.810us         0.57%       4.810us       4.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 879.491us
-Self CUDA time total: 117.984us
+Self CPU time total: 845.851us
+Self CUDA time total: 118.560us
 
 
 
@@ -4509,23 +4509,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     360.471us       637.52%     360.471us     360.471us             1  
-                                      hf_kernels_rotary        18.70%     161.865us        99.47%     860.760us     860.760us       0.000us         0.00%      59.391us      59.391us             1  
-                          _rotary_dba7d1e::apply_rotary         5.21%      45.111us        10.32%      89.333us      14.889us      39.487us        69.84%      39.487us       6.581us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.487us        69.84%      39.487us       6.581us             6  
-                                            aten::clone         2.76%      23.842us        65.28%     564.941us      94.157us       0.000us         0.00%      19.904us       3.317us             6  
-                                            aten::copy_         4.31%      37.312us        58.89%     509.589us      84.931us      17.056us        30.16%      19.904us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        30.16%      17.056us       2.843us             6  
-                                Activity Buffer Request        29.00%     250.989us        29.00%     250.989us     250.989us       2.848us         5.04%       2.848us       2.848us             1  
-                                    aten::empty_strided         3.64%      31.510us         3.64%      31.510us       5.252us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.57%     221.288us        25.57%     221.288us      36.881us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.04%      34.983us         5.16%      44.621us       3.718us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.11%       9.638us         1.11%       9.638us       0.803us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.11%      44.222us         5.11%      44.222us       7.370us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.53%       4.600us         0.53%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     341.982us       603.45%     341.982us     341.982us             1  
+                                      hf_kernels_rotary        18.98%     155.712us        99.43%     815.710us     815.710us       0.000us         0.00%      59.487us      59.487us             1  
+                          _rotary_dba7d1e::apply_rotary         5.25%      43.112us        10.37%      85.045us      14.174us      39.839us        70.30%      39.839us       6.640us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.839us        70.30%      39.839us       6.640us             6  
+                                            aten::clone         2.51%      20.600us        64.82%     531.763us      88.627us       0.000us         0.00%      19.648us       3.275us             6  
+                                            aten::copy_         4.52%      37.100us        58.54%     480.262us      80.044us      16.832us        29.70%      19.648us       3.275us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us        29.70%      16.832us       2.805us             6  
+                                Activity Buffer Request        32.45%     266.237us        32.45%     266.237us     266.237us       2.816us         4.97%       2.816us       2.816us             1  
+                                    aten::empty_strided         3.77%      30.901us         3.77%      30.901us       5.150us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.57%     176.925us        21.57%     176.925us      29.488us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.05%      33.240us         5.26%      43.190us       3.599us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.21%       9.950us         1.21%       9.950us       0.829us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.11%      41.933us         5.11%      41.933us       6.989us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.700us         0.57%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 865.360us
-Self CUDA time total: 56.543us
+Self CPU time total: 820.410us
+Self CUDA time total: 56.671us
 
 
 
@@ -4535,23 +4535,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.053us       293.57%     348.053us     348.053us             1  
-                                      hf_kernels_rotary        18.59%     158.086us        99.46%     845.630us     845.630us       0.000us         0.00%     135.933us     135.933us             1  
-                                            aten::clone         2.59%      22.020us        65.95%     560.690us      93.448us       0.000us         0.00%      70.752us      11.792us             6  
-                                            aten::copy_         4.43%      37.632us        59.68%     507.389us      84.565us      53.376us        45.02%      70.752us      11.792us             6  
-                          _rotary_dba7d1e::apply_rotary         5.16%      43.870us        10.14%      86.234us      14.372us      65.181us        54.98%      65.181us      10.864us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.181us        54.98%      65.181us      10.864us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.376us        45.02%      53.376us       8.896us             6  
-                                Activity Buffer Request        29.66%     252.179us        29.66%     252.179us     252.179us      17.376us        14.66%      17.376us      17.376us             1  
-                                    aten::empty_strided         3.68%      31.281us         3.68%      31.281us       5.213us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.59%     217.578us        25.59%     217.578us      36.263us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.78%      32.121us         4.78%      40.620us       3.385us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.00%       8.499us         1.00%       8.499us       0.708us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.98%      42.364us         4.98%      42.364us       7.061us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.54%       4.590us         0.54%       4.590us       4.590us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     388.726us       325.86%     388.726us     388.726us             1  
+                                      hf_kernels_rotary        19.76%     169.936us        99.45%     855.401us     855.401us       0.000us         0.00%     136.923us     136.923us             1  
+                                            aten::clone         2.64%      22.710us        63.15%     543.123us      90.521us       0.000us         0.00%      70.877us      11.813us             6  
+                                            aten::copy_         4.46%      38.370us        56.50%     485.931us      80.988us      53.246us        44.64%      70.877us      11.813us             6  
+                          _rotary_dba7d1e::apply_rotary         5.64%      48.490us        10.91%      93.801us      15.634us      66.046us        55.36%      66.046us      11.008us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      66.046us        55.36%      66.046us      11.008us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.246us        44.64%      53.246us       8.874us             6  
+                                Activity Buffer Request        30.83%     265.147us        30.83%     265.147us     265.147us      17.631us        14.78%      17.631us      17.631us             1  
+                                    aten::empty_strided         4.01%      34.482us         4.01%      34.482us       5.747us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.21%     182.414us        21.21%     182.414us      30.402us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.39%      37.781us         5.64%      48.541us       4.045us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.25%      10.760us         1.25%      10.760us       0.897us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.27%      45.311us         5.27%      45.311us       7.552us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.55%       4.700us         0.55%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 850.220us
-Self CUDA time total: 118.557us
+Self CPU time total: 860.101us
+Self CUDA time total: 119.292us
 
 
 
@@ -4561,23 +4561,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.432us       183.93%     361.432us     361.432us             1  
-                                      hf_kernels_rotary        18.55%     158.934us        99.44%     851.909us     851.909us       0.000us         0.00%     220.221us     220.221us             1  
-                          _rotary_dba7d1e::apply_rotary         5.09%      43.629us        10.06%      86.174us      14.362us     115.517us        58.78%     115.517us      19.253us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     115.517us        58.78%     115.517us      19.253us             6  
-                                            aten::clone         2.64%      22.651us        66.00%     565.440us      94.240us       0.000us         0.00%     104.704us      17.451us             6  
-                                            aten::copy_         4.43%      37.970us        59.78%     512.129us      85.355us      80.992us        41.22%     104.704us      17.451us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      80.992us        41.22%      80.992us      13.499us             6  
-                                Activity Buffer Request        29.36%     251.489us        29.36%     251.489us     251.489us      23.712us        12.07%      23.712us      23.712us             1  
-                                    aten::empty_strided         3.58%      30.660us         3.58%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.99%     222.670us        25.99%     222.670us      37.112us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.80%      32.582us         4.83%      41.361us       3.447us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.02%       8.779us         1.02%       8.779us       0.732us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.97%      42.545us         4.97%      42.545us       7.091us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.56%       4.770us         0.56%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     357.115us       181.96%     357.115us     357.115us             1  
+                                      hf_kernels_rotary        18.86%     155.885us        99.43%     821.750us     821.750us       0.000us         0.00%     219.904us     219.904us             1  
+                          _rotary_dba7d1e::apply_rotary         5.36%      44.321us        10.59%      87.561us      14.594us     115.808us        59.01%     115.808us      19.301us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     115.808us        59.01%     115.808us      19.301us             6  
+                                            aten::clone         2.51%      20.740us        64.81%     535.643us      89.274us       0.000us         0.00%     104.096us      17.349us             6  
+                                            aten::copy_         4.34%      35.891us        58.73%     485.402us      80.900us      80.448us        40.99%     104.096us      17.349us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      80.448us        40.99%      80.448us      13.408us             6  
+                                Activity Buffer Request        32.66%     269.957us        32.66%     269.957us     269.957us      23.648us        12.05%      23.648us      23.648us             1  
+                                    aten::empty_strided         3.57%      29.501us         3.57%      29.501us       4.917us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.72%     179.554us        21.72%     179.554us      29.926us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.97%      32.801us         5.16%      42.661us       3.555us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.19%       9.860us         1.19%       9.860us       0.822us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.23%      43.240us         5.23%      43.240us       7.207us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.750us         0.57%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 856.679us
-Self CUDA time total: 196.509us
+Self CPU time total: 826.500us
+Self CUDA time total: 196.256us
 
 
 
@@ -4587,29 +4587,29 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary        12.27%     154.345us        67.03%     843.460us     843.460us       0.000us         0.00%     849.461us     849.461us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     791.349us       101.00%     791.349us     791.349us             1  
-                                            aten::clone         1.79%      22.531us        44.41%     558.811us      93.135us       0.000us         0.00%     577.848us      96.308us             6  
-                                            aten::copy_         2.94%      36.962us        40.15%     505.198us      84.200us     511.865us        65.33%     577.848us      96.308us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     511.865us        65.33%     511.865us      85.311us             6  
-                          _rotary_dba7d1e::apply_rotary         3.50%      44.071us         7.04%      88.532us      14.755us     271.613us        34.67%     271.613us      45.269us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     271.613us        34.67%     271.613us      45.269us             6  
-                                Activity Buffer Request        20.09%     252.769us        20.09%     252.769us     252.769us      65.983us         8.42%      65.983us      65.983us             1  
-                                    aten::empty_strided         2.47%      31.082us         2.47%      31.082us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.12%     215.467us        17.12%     215.467us      35.911us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.61%      32.851us         3.32%      41.772us       3.481us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.71%       8.921us         0.71%       8.921us       0.743us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         3.53%      44.461us         3.53%      44.461us       7.410us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        32.97%     414.834us        32.97%     414.834us     414.834us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary        13.04%     159.984us        66.42%     814.800us     814.800us       0.000us         0.00%     847.705us     847.705us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     789.466us       101.01%     789.466us     789.466us             1  
+                                            aten::clone         1.84%      22.521us        42.98%     527.184us      87.864us       0.000us         0.00%     577.883us      96.314us             6  
+                                            aten::copy_         2.96%      36.311us        38.61%     473.681us      78.947us     511.772us        65.48%     577.883us      96.314us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     511.772us        65.48%     511.772us      85.295us             6  
+                          _rotary_dba7d1e::apply_rotary         3.59%      44.023us         6.92%      84.943us      14.157us     269.822us        34.52%     269.822us      44.970us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     269.822us        34.52%     269.822us      44.970us             6  
+                                Activity Buffer Request        21.07%     258.456us        21.07%     258.456us     258.456us      66.111us         8.46%      66.111us      66.111us             1  
+                                    aten::empty_strided         2.53%      30.982us         2.53%      30.982us       5.164us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.58%     178.914us        14.58%     178.914us      29.819us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.74%      33.620us         3.48%      42.689us       3.557us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.74%       9.069us         0.74%       9.069us       0.756us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.34%      40.920us         3.34%      40.920us       6.820us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        33.58%     411.910us        33.58%     411.910us     411.910us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.258ms
-Self CUDA time total: 783.478us
+Self CPU time total: 1.227ms
+Self CUDA time total: 781.594us
 
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.10  False
 hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  False
 hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  False
@@ -4635,13 +4635,12 @@ hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  False
 
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 20%|██ | 1/5 [00:00<00:00, 7.39it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 16.59it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.43it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22.14it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22.12it/s]

Artifacts:

rotary.jsonl diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html index f2e07316cf3df8891afa30950cda265901d2fcae..7606a093a65d04c40d580abf67d210368fd50dcd 100644 --- a/rotary/impls/torch_rotary.html +++ b/rotary/impls/torch_rotary.html @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.20s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
-
Tue Oct 28 14:08:24 2025       
+
Wed Oct 29 14:26:51 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   29C    P0             90W /  350W |       0MiB /  46068MiB |     24%      Default |
+| N/A   32C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 3.87s
+Cell: benchmark | 3.84s
  | 
 
 Raw
@@ -3999,27 +3999,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.099ms      1229.41%       1.099ms       1.099ms             1  
-                                            torch_eager        14.68%     402.893us        99.74%       2.737ms       2.737ms       0.000us         0.00%      90.654us      90.654us             1  
-                                              aten::mul         6.18%     169.712us        10.63%     291.789us      12.158us      46.975us        52.54%      46.975us       1.957us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.975us        52.54%      46.975us       1.957us            24  
-                                            aten::copy_         5.12%     140.498us        62.48%       1.714ms      95.244us      29.151us        32.61%      30.399us       1.689us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.400us        25.05%      22.400us       1.867us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.85%      13.280us       1.107us            12  
-                                            aten::clone         1.37%      37.603us        60.57%       1.662ms     277.027us       0.000us         0.00%       7.999us       1.333us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us         7.55%       6.751us       1.125us             6  
-                                              aten::sub         1.57%      43.112us         2.52%      69.272us      11.545us       6.688us         7.48%       6.688us       1.115us             6  
-                                              aten::add         1.32%      36.261us         2.18%      59.731us       9.955us       6.592us         7.37%       6.592us       1.099us             6  
-                                Activity Buffer Request        52.27%       1.434ms        52.27%       1.434ms       1.434ms       1.248us         1.40%       1.248us       1.248us             1  
-                                    aten::empty_strided         2.02%      55.541us         2.02%      55.541us       9.257us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.66%      72.862us         2.66%      72.862us      12.144us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.02%      82.803us         3.84%     105.504us       4.396us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.83%      22.701us         0.83%      22.701us       0.946us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.69%     238.340us         8.69%     238.340us       4.965us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.26%       7.250us         0.26%       7.250us       7.250us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.124ms      1261.56%       1.124ms       1.124ms             1  
+                                            torch_eager        14.73%     412.767us        99.72%       2.794ms       2.794ms       0.000us         0.00%      90.337us      90.337us             1  
+                                              aten::mul         6.25%     175.043us        11.07%     310.105us      12.921us      46.912us        52.64%      46.912us       1.955us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.912us        52.64%      46.912us       1.955us            24  
+                                            aten::copy_         4.12%     115.463us        61.76%       1.730ms      96.132us      28.993us        32.53%      30.210us       1.678us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.368us        25.10%      22.368us       1.864us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.215us        14.83%      13.215us       1.101us            12  
+                                            aten::clone         1.31%      36.692us        59.66%       1.671ms     278.565us       0.000us         0.00%       7.842us       1.307us             6  
+                                              aten::sub         1.68%      47.063us         2.72%      76.213us      12.702us       6.655us         7.47%       6.655us       1.109us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.625us         7.43%       6.625us       1.104us             6  
+                                              aten::add         1.39%      39.044us         2.34%      65.583us      10.930us       6.560us         7.36%       6.560us       1.093us             6  
+                                Activity Buffer Request        52.45%       1.470ms        52.45%       1.470ms       1.470ms       1.217us         1.37%       1.217us       1.217us             1  
+                                    aten::empty_strided         1.99%      55.621us         1.99%      55.621us       9.270us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.66%      74.431us         2.66%      74.431us      12.405us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.98%      83.492us         3.80%     106.494us       4.437us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.82%      23.002us         0.82%      23.002us       0.958us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.34%     261.675us         9.34%     261.675us       5.452us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.28%       7.890us         0.28%       7.890us       7.890us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.744ms
-Self CUDA time total: 89.406us
+Self CPU time total: 2.802ms
+Self CUDA time total: 89.120us
 
 
 
@@ -4029,27 +4029,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.001ms      1104.88%       1.001ms       1.001ms             1  
-                                            torch_eager        13.31%     340.683us        99.79%       2.555ms       2.555ms       0.000us         0.00%      91.680us      91.680us             1  
-                                              aten::mul         6.04%     154.674us        10.48%     268.377us      11.182us      47.810us        52.79%      47.810us       1.992us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.810us        52.79%      47.810us       1.992us            24  
-                                            aten::copy_         4.35%     111.424us        65.16%       1.668ms      92.682us      29.407us        32.47%      30.527us       1.696us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.559us        24.91%      22.559us       1.880us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.343us        14.73%      13.343us       1.112us            12  
-                                            aten::clone         1.08%      27.742us        62.03%       1.588ms     264.676us       0.000us         0.00%       7.968us       1.328us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.848us         7.56%       6.848us       1.141us             6  
-                                              aten::sub         1.52%      38.791us         2.50%      64.042us      10.674us       6.720us         7.42%       6.720us       1.120us             6  
-                                              aten::add         1.27%      32.413us         2.18%      55.903us       9.317us       6.623us         7.31%       6.623us       1.104us             6  
-                                Activity Buffer Request        56.03%       1.434ms        56.03%       1.434ms       1.434ms       1.120us         1.24%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.42%      36.451us         1.42%      36.451us       6.075us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.10%      53.872us         2.10%      53.872us       8.979us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.86%      73.182us         3.65%      93.342us       3.889us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.79%      20.160us         0.79%      20.160us       0.840us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.02%     231.028us         9.02%     231.028us       4.813us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.420us         0.21%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     968.092us      1071.28%     968.092us     968.092us             1  
+                                            torch_eager        12.50%     317.076us        99.79%       2.532ms       2.532ms       0.000us         0.00%      91.488us      91.488us             1  
+                                              aten::mul         6.07%     153.959us        10.35%     262.528us      10.939us      47.648us        52.73%      47.648us       1.985us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.648us        52.73%      47.648us       1.985us            24  
+                                            aten::copy_         4.16%     105.603us        65.14%       1.653ms      91.828us      29.344us        32.47%      30.464us       1.692us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        25.00%      22.592us       1.883us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.80%      13.376us       1.115us            12  
+                                            aten::clone         1.12%      28.391us        62.74%       1.592ms     265.351us       0.000us         0.00%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us         7.47%       6.752us       1.125us             6  
+                                              aten::sub         1.55%      39.261us         2.49%      63.132us      10.522us       6.688us         7.40%       6.688us       1.115us             6  
+                                              aten::add         1.47%      37.180us         2.35%      59.741us       9.957us       6.688us         7.40%       6.688us       1.115us             6  
+                                Activity Buffer Request        56.17%       1.425ms        56.17%       1.425ms       1.425ms       1.120us         1.24%       1.120us       1.120us             1  
+                                    aten::empty_strided         2.04%      51.662us         2.04%      51.662us       8.610us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.12%      53.792us         2.12%      53.792us       8.965us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.04%      77.153us         3.82%      96.932us       4.039us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      19.779us         0.78%      19.779us       0.824us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.79%     223.101us         8.79%     223.101us       4.648us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.210us         0.21%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.560ms
-Self CUDA time total: 90.560us
+Self CPU time total: 2.538ms
+Self CUDA time total: 90.368us
 
 
 
@@ -4059,27 +4059,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.341us      1003.42%     944.341us     944.341us             1  
-                                            torch_eager        12.66%     316.554us        99.80%       2.495ms       2.495ms       0.000us         0.00%      95.424us      95.424us             1  
-                                              aten::mul         6.01%     150.161us        10.40%     259.987us      10.833us      48.863us        51.92%      48.863us       2.036us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.863us        51.92%      48.863us       2.036us            24  
-                                            aten::copy_         4.06%     101.511us        66.21%       1.655ms      91.941us      30.785us        32.71%      32.097us       1.783us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.009us        24.45%      23.009us       1.917us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.464us        15.37%      14.464us       1.205us            12  
-                                            aten::clone         1.08%      26.971us        63.11%       1.577ms     262.904us       0.000us         0.00%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         8.26%       7.776us       1.296us             6  
-                                              aten::add         1.43%      35.631us         2.33%      58.151us       9.692us       7.233us         7.69%       7.233us       1.205us             6  
-                                              aten::sub         1.42%      35.432us         2.34%      58.413us       9.736us       7.231us         7.68%       7.231us       1.205us             6  
-                                Activity Buffer Request        57.41%       1.435ms        57.41%       1.435ms       1.435ms       1.312us         1.39%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.23%      30.860us         1.23%      30.860us       5.143us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.03%      50.692us         2.03%      50.692us       8.449us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.76%      69.107us         3.55%      88.725us       3.697us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      19.618us         0.78%      19.618us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.92%     222.961us         8.92%     222.961us       4.645us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.071us         0.20%       5.071us       5.071us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.007ms      1071.77%       1.007ms       1.007ms             1  
+                                            torch_eager        12.81%     333.813us        99.77%       2.600ms       2.600ms       0.000us         0.00%      95.234us      95.234us             1  
+                                              aten::mul         6.17%     160.752us        10.75%     280.063us      11.669us      48.706us        51.86%      48.706us       2.029us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.706us        51.86%      48.706us       2.029us            24  
+                                            aten::copy_         4.30%     112.081us        64.85%       1.690ms      93.891us      30.753us        32.74%      32.065us       1.781us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.009us        24.50%      23.009us       1.917us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.463us        15.40%      14.463us       1.205us            12  
+                                            aten::clone         1.08%      28.070us        62.18%       1.621ms     270.093us       0.000us         0.00%       9.056us       1.509us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         8.25%       7.744us       1.291us             6  
+                                              aten::sub         1.50%      39.201us         2.50%      65.063us      10.844us       7.263us         7.73%       7.263us       1.211us             6  
+                                              aten::add         1.40%      36.592us         2.30%      59.882us       9.980us       7.200us         7.67%       7.200us       1.200us             6  
+                                Activity Buffer Request        55.61%       1.449ms        55.61%       1.449ms       1.449ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.87%      48.773us         1.87%      48.773us       8.129us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.21%      57.593us         2.21%      57.593us       9.599us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.85%      74.230us         3.62%      94.450us       3.935us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      20.220us         0.78%      20.220us       0.842us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.19%     239.464us         9.19%     239.464us       4.989us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.23%       5.970us         0.23%       5.970us       5.970us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.500ms
-Self CUDA time total: 94.112us
+Self CPU time total: 2.606ms
+Self CUDA time total: 93.922us
 
 
 
@@ -4089,27 +4089,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     949.272us       934.99%     949.272us     949.272us             1  
-                                            torch_eager        11.74%     319.184us        99.83%       2.715ms       2.715ms       0.000us         0.00%     102.839us     102.839us             1  
-                                              aten::mul         5.42%     147.290us         9.69%     263.662us      10.986us      53.022us        52.22%      53.022us       2.209us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      53.022us        52.22%      53.022us       2.209us            24  
-                                            aten::copy_         3.75%     101.924us        68.58%       1.865ms     103.635us      32.444us        31.96%      33.755us       1.875us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.637us        24.27%      24.637us       2.053us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.062us        15.82%      16.062us       1.339us            12  
-                                            aten::clone         1.13%      30.729us        66.03%       1.796ms     299.314us       0.000us         0.00%       9.118us       1.520us             6  
-                                              aten::add         1.18%      32.140us         2.02%      54.851us       9.142us       8.032us         7.91%       8.032us       1.339us             6  
-                                              aten::sub         1.29%      35.030us         2.16%      58.621us       9.770us       8.030us         7.91%       8.030us       1.338us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         7.69%       7.807us       1.301us             6  
-                                Activity Buffer Request        53.21%       1.447ms        53.21%       1.447ms       1.447ms       1.311us         1.29%       1.311us       1.311us             1  
-                                    aten::empty_strided         1.17%      31.801us         1.17%      31.801us       5.300us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.34%     254.009us         9.34%     254.009us      42.335us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.60%      70.842us         3.35%      90.984us       3.791us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.74%      20.142us         0.74%      20.142us       0.839us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.27%     224.985us         8.27%     224.985us       4.687us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.17%       4.671us         0.17%       4.671us       4.671us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     976.889us       967.02%     976.889us     976.889us             1  
+                                            torch_eager        12.01%     329.416us        99.82%       2.739ms       2.739ms       0.000us         0.00%     102.333us     102.333us             1  
+                                              aten::mul         5.67%     155.545us         9.73%     266.927us      11.122us      52.800us        52.27%      52.800us       2.200us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.800us        52.27%      52.800us       2.200us            24  
+                                            aten::copy_         3.82%     104.765us        68.18%       1.871ms     103.922us      32.349us        32.02%      33.661us       1.870us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.574us        24.33%      24.574us       2.048us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.872us        15.71%      15.872us       1.323us            12  
+                                            aten::clone         1.07%      29.290us        65.23%       1.790ms     298.277us       0.000us         0.00%       9.087us       1.515us             6  
+                                              aten::sub         1.39%      38.150us         2.28%      62.431us      10.405us       7.936us         7.86%       7.936us       1.323us             6  
+                                              aten::add         1.24%      34.113us         2.07%      56.743us       9.457us       7.936us         7.86%       7.936us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.70%       7.775us       1.296us             6  
+                                Activity Buffer Request        52.33%       1.436ms        52.33%       1.436ms       1.436ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.16%      31.821us         1.16%      31.821us       5.304us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.42%     258.335us         9.42%     258.335us      43.056us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      72.071us         3.33%      91.411us       3.809us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      19.340us         0.70%      19.340us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.39%     230.176us         8.39%     230.176us       4.795us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.010us         0.18%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.720ms
-Self CUDA time total: 101.528us
+Self CPU time total: 2.744ms
+Self CUDA time total: 101.021us
 
 
 
@@ -4119,27 +4119,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.887us      1005.38%     944.887us     944.887us             1  
-                                            torch_eager        11.86%     320.838us        99.82%       2.700ms       2.700ms       0.000us         0.00%      95.295us      95.295us             1  
-                                              aten::mul         5.37%     145.335us         9.42%     254.837us      10.618us      49.024us        52.16%      49.024us       2.043us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.024us        52.16%      49.024us       2.043us            24  
-                                            aten::copy_         3.87%     104.672us        68.80%       1.861ms     103.396us      30.783us        32.75%      32.095us       1.783us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.38%      22.912us       1.909us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.08%      14.176us       1.181us            12  
-                                            aten::clone         1.07%      28.861us        66.14%       1.789ms     298.231us       0.000us         0.00%       9.183us       1.530us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.871us         8.37%       7.871us       1.312us             6  
-                                              aten::sub         1.26%      33.972us         2.12%      57.464us       9.577us       7.103us         7.56%       7.103us       1.184us             6  
-                                              aten::add         1.16%      31.253us         1.99%      53.964us       8.994us       7.073us         7.53%       7.073us       1.179us             6  
-                                Activity Buffer Request        53.80%       1.456ms        53.80%       1.456ms       1.456ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.17%      31.633us         1.17%      31.633us       5.272us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.82%     238.648us         8.82%     238.648us      39.775us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.67%      72.119us         3.38%      91.532us       3.814us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.72%      19.413us         0.72%      19.413us       0.809us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.06%     217.970us         8.06%     217.970us       4.541us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       4.990us         0.18%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     972.954us      1035.95%     972.954us     972.954us             1  
+                                            torch_eager        11.82%     323.628us        99.83%       2.734ms       2.734ms       0.000us         0.00%      95.231us      95.231us             1  
+                                              aten::mul         5.48%     150.092us         9.71%     265.906us      11.079us      48.958us        52.13%      48.958us       2.040us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.958us        52.13%      48.958us       2.040us            24  
+                                            aten::copy_         4.01%     109.805us        68.55%       1.878ms     104.307us      30.784us        32.78%      32.096us       1.783us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.40%      22.912us       1.909us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.177us        15.09%      14.177us       1.181us            12  
+                                            aten::clone         0.98%      26.740us        65.50%       1.794ms     299.012us       0.000us         0.00%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.38%       7.872us       1.312us             6  
+                                              aten::sub         1.35%      37.100us         2.22%      60.781us      10.130us       7.106us         7.57%       7.106us       1.184us             6  
+                                              aten::add         1.26%      34.471us         2.07%      56.641us       9.440us       7.071us         7.53%       7.071us       1.178us             6  
+                                Activity Buffer Request        53.28%       1.459ms        53.28%       1.459ms       1.459ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.12%      30.591us         1.12%      30.591us       5.098us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.84%     242.034us         8.84%     242.034us      40.339us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.64%      72.284us         3.37%      92.363us       3.848us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.73%      20.079us         0.73%      20.079us       0.837us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.33%     228.067us         8.33%     228.067us       4.751us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.17%       4.701us         0.17%       4.701us       4.701us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.705ms
-Self CUDA time total: 93.983us
+Self CPU time total: 2.739ms
+Self CUDA time total: 93.919us
 
 
 
@@ -4149,27 +4149,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.250us       902.45%     912.250us     912.250us             1  
-                                            torch_eager        10.84%     287.380us        99.80%       2.646ms       2.646ms       0.000us         0.00%     102.398us     102.398us             1  
-                                              aten::mul         5.43%     143.901us         9.61%     254.716us      10.613us      52.767us        52.20%      52.767us       2.199us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.767us        52.20%      52.767us       2.199us            24  
-                                            aten::copy_         3.82%     101.373us        69.76%       1.849ms     102.733us      32.416us        32.07%      33.728us       1.874us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.608us        24.34%      24.608us       2.051us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.903us        15.73%      15.903us       1.325us            12  
-                                            aten::clone         0.89%      23.520us        66.94%       1.774ms     295.745us       0.000us         0.00%       9.120us       1.520us             6  
-                                              aten::add         1.25%      33.223us         2.12%      56.323us       9.387us       7.968us         7.88%       7.968us       1.328us             6  
-                                              aten::sub         1.34%      35.391us         2.21%      58.453us       9.742us       7.935us         7.85%       7.935us       1.322us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.72%       7.808us       1.301us             6  
-                                Activity Buffer Request        54.59%       1.447ms        54.59%       1.447ms       1.447ms       1.312us         1.30%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.14%      30.292us         1.14%      30.292us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.04%     239.538us         9.04%     239.538us      39.923us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.52%      66.730us         3.23%      85.664us       3.569us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.71%      18.934us         0.71%      18.934us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.23%     218.091us         8.23%     218.091us       4.544us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.360us         0.20%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     940.506us       929.78%     940.506us     940.506us             1  
+                                            torch_eager        10.47%     280.203us        99.80%       2.672ms       2.672ms       0.000us         0.00%     102.466us     102.466us             1  
+                                              aten::mul         5.68%     151.942us         9.93%     265.874us      11.078us      52.767us        52.17%      52.767us       2.199us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.767us        52.17%      52.767us       2.199us            24  
+                                            aten::copy_         3.99%     106.699us        69.68%       1.866ms     103.641us      32.384us        32.01%      33.696us       1.872us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.672us        24.39%      24.672us       2.056us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.003us        15.82%      16.003us       1.334us            12  
+                                            aten::clone         0.80%      21.540us        66.42%       1.778ms     296.379us       0.000us         0.00%       9.024us       1.504us             6  
+                                              aten::sub         1.42%      38.052us         2.40%      64.133us      10.689us       8.002us         7.91%       8.002us       1.334us             6  
+                                              aten::add         1.23%      32.860us         2.10%      56.182us       9.364us       8.001us         7.91%       8.001us       1.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.62%       7.712us       1.285us             6  
+                                Activity Buffer Request        54.45%       1.458ms        54.45%       1.458ms       1.458ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.14%      30.450us         1.14%      30.450us       5.075us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.74%     234.006us         8.74%     234.006us      39.001us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.58%      69.109us         3.28%      87.850us       3.660us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      18.741us         0.70%      18.741us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.61%     230.527us         8.61%     230.527us       4.803us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.400us         0.20%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.651ms
-Self CUDA time total: 101.086us
+Self CPU time total: 2.677ms
+Self CUDA time total: 101.154us
 
 
 
@@ -4179,27 +4179,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     920.762us       761.21%     920.762us     920.762us             1  
-                                            torch_eager        10.74%     283.666us        99.80%       2.636ms       2.636ms       0.000us         0.00%     122.785us     122.785us             1  
-                                              aten::mul         5.61%     148.102us         9.80%     258.888us      10.787us      62.177us        51.40%      62.177us       2.591us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.177us        51.40%      62.177us       2.591us            24  
-                                            aten::copy_         4.01%     105.842us        69.73%       1.842ms     102.324us      39.520us        32.67%      41.344us       2.297us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.896us        23.89%      28.896us       2.408us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.93%      19.264us       1.605us            12  
-                                            aten::clone         0.81%      21.319us        66.69%       1.761ms     293.582us       0.000us         0.00%      12.448us       2.075us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.624us         8.78%      10.624us       1.771us             6  
-                                              aten::add         1.23%      32.431us         2.08%      54.912us       9.152us       9.696us         8.02%       9.696us       1.616us             6  
-                                              aten::sub         1.34%      35.510us         2.24%      59.050us       9.842us       9.568us         7.91%       9.568us       1.595us             6  
-                                Activity Buffer Request        54.62%       1.443ms        54.62%       1.443ms       1.443ms       1.824us         1.51%       1.824us       1.824us             1  
-                                    aten::empty_strided         1.13%      29.871us         1.13%      29.871us       4.979us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.76%     231.329us         8.76%     231.329us      38.555us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.53%      66.872us         3.28%      86.661us       3.611us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.75%      19.789us         0.75%      19.789us       0.825us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.28%     218.631us         8.28%     218.631us       4.555us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.190us         0.20%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.015ms       844.44%       1.015ms       1.015ms             1  
+                                            torch_eager        10.99%     299.529us        99.80%       2.720ms       2.720ms       0.000us         0.00%     122.045us     122.045us             1  
+                                              aten::mul         5.97%     162.734us        10.28%     280.227us      11.676us      61.856us        51.45%      61.856us       2.577us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.856us        51.45%      61.856us       2.577us            24  
+                                            aten::copy_         4.97%     135.364us        68.63%       1.870ms     103.912us      39.199us        32.61%      41.023us       2.279us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.704us        23.88%      28.704us       2.392us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.166us        15.94%      19.166us       1.597us            12  
+                                            aten::clone         0.84%      22.992us        64.39%       1.755ms     292.512us       0.000us         0.00%      12.319us       2.053us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.73%      10.495us       1.749us             6  
+                                              aten::add         1.19%      32.530us         2.08%      56.691us       9.448us       9.598us         7.98%       9.598us       1.600us             6  
+                                              aten::sub         1.40%      38.111us         2.30%      62.811us      10.468us       9.568us         7.96%       9.568us       1.595us             6  
+                                Activity Buffer Request        52.53%       1.432ms        52.53%       1.432ms       1.432ms       1.824us         1.52%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.18%      32.290us         1.18%      32.290us       5.382us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.53%     232.585us         8.53%     232.585us      38.764us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.71%      73.938us         3.49%      95.000us       3.958us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.77%      21.062us         0.77%      21.062us       0.878us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.70%     237.086us         8.70%     237.086us       4.939us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.570us         0.20%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.641ms
-Self CUDA time total: 120.961us
+Self CPU time total: 2.726ms
+Self CUDA time total: 120.221us
 
 
 
@@ -4209,27 +4209,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.640us       544.89%     939.640us     939.640us             1  
-                                            torch_eager        12.08%     323.576us        99.81%       2.674ms       2.674ms       0.000us         0.00%     175.325us     175.325us             1  
-                                              aten::mul         5.49%     147.107us         9.55%     255.901us      10.663us      89.504us        51.90%      89.504us       3.729us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.504us        51.90%      89.504us       3.729us            24  
-                                            aten::copy_         3.83%     102.724us        68.48%       1.835ms     101.930us      57.918us        33.59%      60.798us       3.378us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.734us        23.62%      40.734us       3.395us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.023us        14.51%      25.023us       2.085us            12  
-                                            aten::clone         1.06%      28.292us        65.67%       1.760ms     293.252us       0.000us         0.00%      20.064us       3.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.184us         9.96%      17.184us       2.864us             6  
-                                              aten::add         1.22%      32.572us         2.05%      54.872us       9.145us      12.512us         7.26%      12.512us       2.085us             6  
-                                              aten::sub         1.28%      34.403us         2.15%      57.513us       9.586us      12.511us         7.26%      12.511us       2.085us             6  
-                                Activity Buffer Request        53.69%       1.438ms        53.69%       1.438ms       1.438ms       2.880us         1.67%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.12%      30.100us         1.12%      30.100us       5.017us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.57%     229.599us         8.57%     229.599us      38.267us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.59%      69.394us         3.32%      89.005us       3.709us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.73%      19.611us         0.73%      19.611us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.14%     218.155us         8.14%     218.155us       4.545us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.191us         0.19%       5.191us       5.191us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     951.101us       552.87%     951.101us     951.101us             1  
+                                            torch_eager        11.67%     313.772us        99.81%       2.683ms       2.683ms       0.000us         0.00%     174.878us     174.878us             1  
+                                              aten::mul         5.73%     154.081us         9.89%     265.836us      11.076us      89.599us        52.08%      89.599us       3.733us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.599us        52.08%      89.599us       3.733us            24  
+                                            aten::copy_         3.89%     104.453us        68.40%       1.838ms     102.128us      57.664us        33.52%      60.512us       3.362us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.832us        23.74%      40.832us       3.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.767us        14.40%      24.767us       2.064us            12  
+                                            aten::clone         1.01%      27.120us        65.39%       1.758ms     292.937us       0.000us         0.00%      19.680us       3.280us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us         9.78%      16.832us       2.805us             6  
+                                              aten::add         1.27%      34.231us         2.14%      57.531us       9.588us      12.416us         7.22%      12.416us       2.069us             6  
+                                              aten::sub         1.34%      36.001us         2.22%      59.581us       9.930us      12.351us         7.18%      12.351us       2.059us             6  
+                                Activity Buffer Request        53.45%       1.437ms        53.45%       1.437ms       1.437ms       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.13%      30.290us         1.13%      30.290us       5.048us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.55%     229.865us         8.55%     229.865us      38.311us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      70.721us         3.36%      90.322us       3.763us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.73%      19.601us         0.73%      19.601us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.41%     225.976us         8.41%     225.976us       4.708us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.001us         0.19%       5.001us       5.001us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.679ms
-Self CUDA time total: 172.445us
+Self CPU time total: 2.688ms
+Self CUDA time total: 172.030us
 
 
 
@@ -4239,27 +4239,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     910.515us       751.54%     910.515us     910.515us             1  
-                                            torch_eager        19.90%     282.972us        99.65%       1.417ms       1.417ms       0.000us         0.00%     123.009us     123.009us             1  
-                                              aten::mul        10.25%     145.781us        17.92%     254.851us      10.619us      62.146us        51.30%      62.146us       2.589us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.146us        51.30%      62.146us       2.589us            24  
-                                            aten::copy_         7.07%     100.509us        44.20%     628.439us      34.913us      39.743us        32.80%      41.599us       2.311us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      29.055us        23.98%      29.055us       2.421us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.90%      19.264us       1.605us            12  
-                                            aten::clone         1.59%      22.604us        38.82%     551.881us      91.980us       0.000us         0.00%      12.544us       2.091us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.688us         8.82%      10.688us       1.781us             6  
-                                              aten::add         2.23%      31.661us         3.79%      53.922us       8.987us       9.633us         7.95%       9.633us       1.606us             6  
-                                              aten::sub         2.49%      35.352us         4.13%      58.732us       9.789us       9.631us         7.95%       9.631us       1.605us             6  
-                                Activity Buffer Request        16.91%     240.489us        16.91%     240.489us     240.489us       1.856us         1.53%       1.856us       1.856us             1  
-                                    aten::empty_strided         2.06%      29.230us         2.06%      29.230us       4.872us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.93%     226.498us        15.93%     226.498us      37.750us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.75%      67.473us         6.05%      86.070us       3.586us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      18.597us         1.31%      18.597us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.17%     215.654us        15.17%     215.654us       4.493us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       4.980us         0.35%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     927.996us       768.63%     927.996us     927.996us             1  
+                                            torch_eager        20.13%     284.369us        99.65%       1.408ms       1.408ms       0.000us         0.00%     122.557us     122.557us             1  
+                                              aten::mul        10.77%     152.163us        18.72%     264.405us      11.017us      62.048us        51.39%      62.048us       2.585us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.048us        51.39%      62.048us       2.585us            24  
+                                            aten::copy_         7.56%     106.823us        43.43%     613.475us      34.082us      39.390us        32.63%      41.213us       2.290us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.91%      28.864us       2.405us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.296us        15.98%      19.296us       1.608us            12  
+                                            aten::clone         1.39%      19.620us        37.04%     523.281us      87.213us       0.000us         0.00%      12.349us       2.058us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.526us         8.72%      10.526us       1.754us             6  
+                                              aten::add         2.28%      32.232us         3.86%      54.523us       9.087us       9.696us         8.03%       9.696us       1.616us             6  
+                                              aten::sub         2.48%      35.082us         4.10%      57.982us       9.664us       9.600us         7.95%       9.600us       1.600us             6  
+                                Activity Buffer Request        14.96%     211.375us        14.96%     211.375us     211.375us       1.823us         1.51%       1.823us       1.823us             1  
+                                    aten::empty_strided         2.07%      29.290us         2.07%      29.290us       4.882us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.27%     229.815us        16.27%     229.815us      38.302us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.68%      66.168us         5.95%      84.051us       3.502us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.27%      17.883us         1.27%      17.883us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.78%     222.895us        15.78%     222.895us       4.644us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       4.970us         0.35%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.422ms
-Self CUDA time total: 121.153us
+Self CPU time total: 1.413ms
+Self CUDA time total: 120.734us
 
 
 
@@ -4269,27 +4269,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     918.443us       533.10%     918.443us     918.443us             1  
-                                            torch_eager        20.03%     279.953us        99.65%       1.393ms       1.393ms       0.000us         0.00%     175.133us     175.133us             1  
-                                              aten::mul        10.59%     147.997us        18.47%     258.229us      10.760us      89.472us        51.93%      89.472us       3.728us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.472us        51.93%      89.472us       3.728us            24  
-                                            aten::copy_         7.43%     103.844us        43.15%     603.182us      33.510us      57.887us        33.60%      60.735us       3.374us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.831us        23.70%      40.831us       3.403us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.926us        14.47%      24.926us       2.077us            12  
-                                            aten::clone         1.45%      20.289us        37.34%     521.998us      87.000us       0.000us         0.00%      19.904us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us         9.90%      17.056us       2.843us             6  
-                                              aten::add         2.21%      30.953us         3.79%      53.002us       8.834us      12.480us         7.24%      12.480us       2.080us             6  
-                                              aten::sub         2.40%      33.491us         4.09%      57.142us       9.524us      12.446us         7.22%      12.446us       2.074us             6  
-                                Activity Buffer Request        14.98%     209.468us        14.98%     209.468us     209.468us       2.848us         1.65%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.03%      28.380us         2.03%      28.380us       4.730us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.36%     228.728us        16.36%     228.728us      38.121us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.25%      73.370us         6.64%      92.881us       3.870us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.40%      19.511us         1.40%      19.511us       0.813us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.53%     217.074us        15.53%     217.074us       4.522us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       4.950us         0.35%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     941.367us       547.21%     941.367us     941.367us             1  
+                                            torch_eager        19.36%     280.543us        99.66%       1.444ms       1.444ms       0.000us         0.00%     174.877us     174.877us             1  
+                                              aten::mul        10.67%     154.592us        18.48%     267.677us      11.153us      89.535us        52.05%      89.535us       3.731us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.535us        52.05%      89.535us       3.731us            24  
+                                            aten::copy_         7.38%     106.934us        44.27%     641.329us      35.629us      57.694us        33.54%      60.542us       3.363us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.701us        23.66%      40.701us       3.392us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.800us        14.42%      24.800us       2.067us            12  
+                                            aten::clone         1.44%      20.830us        37.97%     550.103us      91.684us       0.000us         0.00%      19.841us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.993us         9.88%      16.993us       2.832us             6  
+                                              aten::add         2.36%      34.121us         3.90%      56.522us       9.420us      12.448us         7.24%      12.448us       2.075us             6  
+                                              aten::sub         2.56%      37.161us         4.27%      61.881us      10.313us      12.352us         7.18%      12.352us       2.059us             6  
+                                Activity Buffer Request        16.20%     234.686us        16.20%     234.686us     234.686us       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.02%      29.270us         2.02%      29.270us       4.878us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.95%     231.027us        15.95%     231.027us      38.505us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.63%      67.091us         5.92%      85.764us       3.573us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      18.673us         1.29%      18.673us       0.778us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.80%     228.888us        15.80%     228.888us       4.768us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.980us         0.34%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.398ms
-Self CUDA time total: 172.285us
+Self CPU time total: 1.449ms
+Self CUDA time total: 172.029us
 
 
 
@@ -4299,27 +4299,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.822us       332.63%     945.822us     945.822us             1  
-                                            torch_eager        11.69%     314.391us        99.81%       2.685ms       2.685ms       0.000us         0.00%     302.941us     302.941us             1  
-                                              aten::mul         5.41%     145.454us         9.45%     254.127us      10.589us     133.310us        46.88%     133.310us       5.555us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.310us        46.88%     133.310us       5.555us            24  
-                                            aten::copy_         4.13%     111.027us        68.93%       1.854ms     103.002us     109.662us        38.57%     128.254us       7.125us            18  
-                                            aten::clone         1.07%      28.661us        65.93%       1.773ms     295.570us       0.000us         0.00%      70.912us      11.819us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.342us        20.17%      57.342us       4.779us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.320us        18.40%      52.320us       8.720us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.377us        14.55%      41.377us       3.448us            12  
-                                              aten::sub         1.27%      34.091us         2.15%      57.911us       9.652us      20.704us         7.28%      20.704us       3.451us             6  
-                                              aten::add         1.22%      32.950us         2.07%      55.610us       9.268us      20.673us         7.27%      20.673us       3.446us             6  
-                                Activity Buffer Request        54.12%       1.456ms        54.12%       1.456ms       1.456ms      18.592us         6.54%      18.592us      18.592us             1  
-                                    aten::empty_strided         1.18%      31.741us         1.18%      31.741us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.32%     223.797us         8.32%     223.797us      37.300us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.55%      68.485us         3.28%      88.267us       3.678us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.74%      19.782us         0.74%      19.782us       0.824us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.13%     218.664us         8.13%     218.664us       4.555us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.100us         0.19%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     950.141us       334.64%     950.141us     950.141us             1  
+                                            torch_eager        11.47%     310.562us        99.82%       2.702ms       2.702ms       0.000us         0.00%     302.012us     302.012us             1  
+                                              aten::mul         5.57%     150.802us         9.64%     260.955us      10.873us     133.822us        47.13%     133.822us       5.576us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.822us        47.13%     133.822us       5.576us            24  
+                                            aten::copy_         3.88%     105.155us        69.00%       1.868ms     103.782us     109.151us        38.44%     127.231us       7.068us            18  
+                                            aten::clone         0.99%      26.749us        66.03%       1.788ms     297.926us       0.000us         0.00%      69.886us      11.648us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.345us        20.20%      57.345us       4.779us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.806us        18.25%      51.806us       8.634us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.959us        14.43%      40.959us       3.413us            12  
+                                              aten::sub         1.29%      34.831us         2.15%      58.172us       9.695us      20.607us         7.26%      20.607us       3.435us             6  
+                                              aten::add         1.26%      34.242us         2.11%      57.104us       9.517us      20.352us         7.17%      20.352us       3.392us             6  
+                                Activity Buffer Request        54.34%       1.471ms        54.34%       1.471ms       1.471ms      18.080us         6.37%      18.080us      18.080us             1  
+                                    aten::empty_strided         1.13%      30.492us         1.13%      30.492us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.33%     225.535us         8.33%     225.535us      37.589us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      71.143us         3.33%      90.164us       3.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      19.021us         0.70%      19.021us       0.793us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.22%     222.598us         8.22%     222.598us       4.637us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       4.920us         0.18%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.690ms
-Self CUDA time total: 284.349us
+Self CPU time total: 2.707ms
+Self CUDA time total: 283.932us
 
 
 
@@ -4329,27 +4329,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.033us       165.64%     938.033us     938.033us             1  
-                                            torch_eager        20.89%     291.484us        99.63%       1.390ms       1.390ms       0.000us         0.00%     590.004us     590.004us             1  
-                                            aten::copy_         7.34%     102.395us        41.53%     579.320us      32.184us     273.370us        48.27%     297.081us      16.504us            18  
-                                              aten::mul        10.73%     149.623us        18.75%     261.638us      10.902us     225.916us        39.89%     225.916us       9.413us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     225.916us        39.89%     225.916us       9.413us            24  
-                                            aten::clone         1.46%      20.369us        35.71%     498.147us      83.025us       0.000us         0.00%     206.459us      34.410us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.748us        32.27%     182.748us      30.458us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.622us        16.00%      90.622us       7.552us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      67.007us        11.83%      67.007us       5.584us            12  
-                                              aten::sub         2.52%      35.222us         4.78%      66.682us      11.114us      34.272us         6.05%      34.272us       5.712us             6  
-                                              aten::add         2.30%      32.121us         4.02%      56.063us       9.344us      32.735us         5.78%      32.735us       5.456us             6  
-                                Activity Buffer Request        14.16%     197.506us        14.16%     197.506us     197.506us      23.711us         4.19%      23.711us      23.711us             1  
-                                    aten::empty_strided         2.10%      29.332us         2.10%      29.332us       4.889us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.61%     217.828us        15.61%     217.828us      36.305us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.72%      65.792us         6.10%      85.041us       3.543us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.38%      19.249us         1.38%      19.249us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.42%     229.008us        16.42%     229.008us       4.771us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.150us         0.37%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     966.098us       169.87%     966.098us     966.098us             1  
+                                            torch_eager        20.40%     290.715us        99.64%       1.420ms       1.420ms       0.000us         0.00%     592.377us     592.377us             1  
+                                            aten::copy_         7.41%     105.615us        41.73%     594.574us      33.032us     275.293us        48.40%     298.941us      16.608us            18  
+                                              aten::mul        10.90%     155.244us        18.92%     269.648us      11.235us     227.071us        39.93%     227.071us       9.461us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     227.071us        39.93%     227.071us       9.461us            24  
+                                            aten::clone         1.44%      20.483us        35.30%     502.923us      83.821us       0.000us         0.00%     207.134us      34.522us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.486us        32.26%     183.486us      30.581us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.807us        16.14%      91.807us       7.651us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.365us        11.67%      66.365us       5.530us            12  
+                                              aten::sub         2.66%      37.929us         4.43%      63.131us      10.522us      33.790us         5.94%      33.790us       5.632us             6  
+                                              aten::add         2.47%      35.251us         4.15%      59.172us       9.862us      32.575us         5.73%      32.575us       5.429us             6  
+                                Activity Buffer Request        13.81%     196.814us        13.81%     196.814us     196.814us      23.648us         4.16%      23.648us      23.648us             1  
+                                    aten::empty_strided         2.02%      28.790us         2.02%      28.790us       4.798us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.63%     222.685us        15.63%     222.685us      37.114us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.20%      74.092us         6.55%      93.282us       3.887us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.35%      19.190us         1.35%      19.190us       0.800us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.35%     232.987us        16.35%     232.987us       4.854us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.080us         0.36%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.395ms
-Self CUDA time total: 566.293us
+Self CPU time total: 1.425ms
+Self CUDA time total: 568.729us
 
 
 
@@ -4359,27 +4359,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.211us       984.01%     912.211us     912.211us             1  
-                                            torch_eager        20.74%     286.708us        99.62%       1.377ms       1.377ms       0.000us         0.00%      93.855us      93.855us             1  
-                                              aten::mul        10.48%     144.890us        18.31%     253.080us      10.545us      49.856us        53.78%      49.856us       2.077us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.856us        53.78%      49.856us       2.077us            24  
-                                            aten::copy_         7.33%     101.333us        42.51%     587.542us      32.641us      29.407us        31.72%      30.559us       1.698us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.623us        24.40%      22.623us       1.885us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.440us        14.50%      13.440us       1.120us            12  
-                                            aten::clone         1.54%      21.251us        36.76%     508.068us      84.678us       0.000us         0.00%       7.936us       1.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.32%       6.784us       1.131us             6  
-                                              aten::sub         2.53%      34.908us         4.26%      58.910us       9.818us       6.720us         7.25%       6.720us       1.120us             6  
-                                              aten::add         2.34%      32.341us         3.97%      54.832us       9.139us       6.720us         7.25%       6.720us       1.120us             6  
-                                Activity Buffer Request        14.89%     205.787us        14.89%     205.787us     205.787us       1.152us         1.24%       1.152us       1.152us             1  
-                                    aten::empty_strided         2.09%      28.901us         2.09%      28.901us       4.817us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.89%     219.618us        15.89%     219.618us      36.603us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.84%      66.885us         6.21%      85.845us       3.577us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.37%      18.960us         1.37%      18.960us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.59%     215.487us        15.59%     215.487us       4.489us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.210us         0.38%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     975.032us      1053.20%     975.032us     975.032us             1  
+                                            torch_eager        19.78%     289.798us        99.66%       1.460ms       1.460ms       0.000us         0.00%      93.698us      93.698us             1  
+                                              aten::mul        11.08%     162.260us        19.21%     281.475us      11.728us      49.665us        53.65%      49.665us       2.069us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.665us        53.65%      49.665us       2.069us            24  
+                                            aten::copy_         7.16%     104.830us        42.02%     615.673us      34.204us      29.441us        31.80%      30.561us       1.698us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.657us        24.47%      22.657us       1.888us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.472us        14.55%      13.472us       1.123us            12  
+                                            aten::clone         1.39%      20.311us        36.25%     531.032us      88.505us       0.000us         0.00%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.33%       6.784us       1.131us             6  
+                                              aten::add         2.30%      33.730us         3.98%      58.302us       9.717us       6.752us         7.29%       6.752us       1.125us             6  
+                                              aten::sub         2.57%      37.640us         4.45%      65.262us      10.877us       6.720us         7.26%       6.720us       1.120us             6  
+                                Activity Buffer Request        14.75%     216.135us        14.75%     216.135us     216.135us       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         2.59%      37.931us         2.59%      37.931us       6.322us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.29%     223.986us        15.29%     223.986us      37.331us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.89%      71.623us         6.23%      91.274us       3.803us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.34%      19.651us         1.34%      19.651us       0.819us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.53%     242.131us        16.53%     242.131us       5.044us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       5.040us         0.34%       5.040us       5.040us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.382ms
-Self CUDA time total: 92.703us
+Self CPU time total: 1.465ms
+Self CUDA time total: 92.578us
 
 
 
@@ -4389,27 +4389,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.901us       973.14%     938.901us     938.901us             1  
-                                            torch_eager        11.77%     313.313us        99.82%       2.656ms       2.656ms       0.000us         0.00%      97.825us      97.825us             1  
-                                              aten::mul         5.60%     148.957us         9.78%     260.340us      10.847us      51.266us        53.14%      51.266us       2.136us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.266us        53.14%      51.266us       2.136us            24  
-                                            aten::copy_         3.87%     103.023us        68.29%       1.817ms     100.957us      30.976us        32.11%      32.319us       1.795us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.072us        23.91%      23.072us       1.923us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.240us        14.76%      14.240us       1.187us            12  
-                                            aten::clone         1.07%      28.429us        65.69%       1.748ms     291.327us       0.000us         0.00%       9.247us       1.541us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.19%       7.904us       1.317us             6  
-                                              aten::add         1.24%      33.110us         2.10%      56.011us       9.335us       7.137us         7.40%       7.137us       1.189us             6  
-                                              aten::sub         1.37%      36.490us         2.25%      59.790us       9.965us       7.103us         7.36%       7.103us       1.184us             6  
-                                Activity Buffer Request        53.84%       1.433ms        53.84%       1.433ms       1.433ms       1.343us         1.39%       1.343us       1.343us             1  
-                                    aten::empty_strided         1.19%      31.751us         1.19%      31.751us       5.292us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.25%     219.470us         8.25%     219.470us      36.578us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      69.934us         3.35%      89.134us       3.714us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.72%      19.200us         0.72%      19.200us       0.800us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.25%     219.576us         8.25%     219.576us       4.574us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       4.910us         0.18%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     963.643us      1001.81%     963.643us     963.643us             1  
+                                            torch_eager        11.60%     311.071us        99.82%       2.676ms       2.676ms       0.000us         0.00%      97.534us      97.534us             1  
+                                              aten::mul         5.66%     151.593us        10.00%     268.127us      11.172us      51.103us        53.13%      51.103us       2.129us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.103us        53.13%      51.103us       2.129us            24  
+                                            aten::copy_         3.93%     105.441us        68.13%       1.826ms     101.459us      30.911us        32.14%      32.255us       1.792us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.007us        23.92%      23.007us       1.917us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        14.74%      14.176us       1.181us            12  
+                                            aten::clone         1.04%      27.830us        65.21%       1.748ms     291.325us       0.000us         0.00%       9.248us       1.541us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.22%       7.904us       1.317us             6  
+                                              aten::sub         1.38%      37.040us         2.30%      61.581us      10.264us       7.103us         7.38%       7.103us       1.184us             6  
+                                              aten::add         1.19%      32.000us         2.05%      54.860us       9.143us       7.073us         7.35%       7.073us       1.179us             6  
+                                Activity Buffer Request        53.57%       1.436ms        53.57%       1.436ms       1.436ms       1.344us         1.40%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.19%      31.921us         1.19%      31.921us       5.320us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.14%     218.236us         8.14%     218.236us      36.373us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.76%      74.059us         3.52%      94.290us       3.929us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.75%      20.231us         0.75%      20.231us       0.843us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.60%     230.408us         8.60%     230.408us       4.800us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       4.700us         0.18%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.661ms
-Self CUDA time total: 96.482us
+Self CPU time total: 2.681ms
+Self CUDA time total: 96.190us
 
 
 
@@ -4419,27 +4419,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     932.446us       897.69%     932.446us     932.446us             1  
-                                            torch_eager        11.60%     307.685us        99.81%       2.647ms       2.647ms       0.000us         0.00%     105.184us     105.184us             1  
-                                              aten::mul         5.51%     146.123us         9.64%     255.679us      10.653us      55.362us        53.30%      55.362us       2.307us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.362us        53.30%      55.362us       2.307us            24  
-                                            aten::copy_         3.78%     100.194us        68.64%       1.821ms     101.144us      32.478us        31.27%      33.790us       1.877us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        23.78%      24.703us       2.059us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.032us        15.43%      16.032us       1.336us            12  
-                                            aten::clone         1.02%      27.179us        65.92%       1.748ms     291.378us       0.000us         0.00%       9.087us       1.515us             6  
-                                              aten::add         1.19%      31.489us         2.03%      53.840us       8.973us       8.064us         7.76%       8.064us       1.344us             6  
-                                              aten::sub         1.35%      35.692us         2.26%      59.843us       9.974us       7.968us         7.67%       7.968us       1.328us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.49%       7.775us       1.296us             6  
-                                Activity Buffer Request        54.18%       1.437ms        54.18%       1.437ms       1.437ms       1.312us         1.26%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.21%      32.003us         1.21%      32.003us       5.334us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.25%     218.717us         8.25%     218.717us      36.453us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.67%      70.760us         3.41%      90.371us       3.765us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.74%      19.611us         0.74%      19.611us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.32%     220.800us         8.32%     220.800us       4.600us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.070us         0.19%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     984.120us       950.08%     984.120us     984.120us             1  
+                                            torch_eager        21.32%     307.609us        99.66%       1.438ms       1.438ms       0.000us         0.00%     104.863us     104.863us             1  
+                                              aten::mul        11.11%     160.241us        19.03%     274.535us      11.439us      55.232us        53.32%      55.232us       2.301us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.232us        53.32%      55.232us       2.301us            24  
+                                            aten::copy_         7.56%     109.063us        40.34%     581.983us      32.332us      32.383us        31.26%      33.663us       1.870us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.639us        23.79%      24.639us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.968us        15.42%      15.968us       1.331us            12  
+                                            aten::clone         1.50%      21.672us        34.18%     493.044us      82.174us       0.000us         0.00%       9.024us       1.504us             6  
+                                              aten::add         2.60%      37.520us         4.33%      62.511us      10.418us       8.031us         7.75%       8.031us       1.339us             6  
+                                              aten::sub         2.72%      39.231us         4.56%      65.841us      10.973us       7.937us         7.66%       7.937us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.48%       7.744us       1.291us             6  
+                                Activity Buffer Request        13.05%     188.244us        13.05%     188.244us     188.244us       1.280us         1.24%       1.280us       1.280us             1  
+                                    aten::empty_strided         2.28%      32.882us         2.28%      32.882us       5.480us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.94%     215.555us        14.94%     215.555us      35.926us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.93%      71.162us         6.28%      90.612us       3.776us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.35%      19.450us         1.35%      19.450us       0.810us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.29%     235.016us        16.29%     235.016us       4.896us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.880us         0.34%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.652ms
-Self CUDA time total: 103.872us
+Self CPU time total: 1.443ms
+Self CUDA time total: 103.583us
 
 
 
@@ -4449,27 +4449,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     914.130us       736.81%     914.130us     914.130us             1  
-                                            torch_eager        19.76%     284.015us        99.65%       1.432ms       1.432ms       0.000us         0.00%     125.858us     125.858us             1  
-                                              aten::mul        10.20%     146.586us        17.70%     254.419us      10.601us      65.313us        52.64%      65.313us       2.721us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.313us        52.64%      65.313us       2.721us            24  
-                                            aten::copy_         7.71%     110.793us        44.82%     644.172us      35.787us      39.489us        31.83%      41.281us       2.293us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.961us        23.34%      28.961us       2.413us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.53%      19.264us       1.605us            12  
-                                            aten::clone         1.45%      20.820us        39.14%     562.560us      93.760us       0.000us         0.00%      12.320us       2.053us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us         8.49%      10.528us       1.755us             6  
-                                              aten::add         2.34%      33.572us         3.91%      56.142us       9.357us       9.664us         7.79%       9.664us       1.611us             6  
-                                              aten::sub         2.40%      34.530us         4.02%      57.751us       9.625us       9.600us         7.74%       9.600us       1.600us             6  
-                                Activity Buffer Request        17.82%     256.078us        17.82%     256.078us     256.078us       1.792us         1.44%       1.792us       1.792us             1  
-                                    aten::empty_strided         2.04%      29.262us         2.04%      29.262us       4.877us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.99%     215.437us        14.99%     215.437us      35.906us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.63%      66.508us         5.96%      85.660us       3.569us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.33%      19.152us         1.33%      19.152us       0.798us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        14.99%     215.488us        14.99%     215.488us       4.489us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       5.000us         0.35%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     935.122us       757.84%     935.122us     935.122us             1  
+                                            torch_eager        19.99%     283.519us        99.60%       1.412ms       1.412ms       0.000us         0.00%     125.153us     125.153us             1  
+                                              aten::mul        10.97%     155.634us        18.77%     266.135us      11.089us      65.024us        52.70%      65.024us       2.709us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.024us        52.70%      65.024us       2.709us            24  
+                                            aten::copy_         7.53%     106.809us        43.10%     611.203us      33.956us      39.201us        31.77%      40.961us       2.276us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.769us        23.31%      28.769us       2.397us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us        15.53%      19.168us       1.597us            12  
+                                            aten::clone         1.50%      21.262us        37.00%     524.722us      87.454us       0.000us         0.00%      12.192us       2.032us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.45%      10.432us       1.739us             6  
+                                              aten::add         2.41%      34.151us         3.94%      55.922us       9.320us       9.664us         7.83%       9.664us       1.611us             6  
+                                              aten::sub         2.49%      35.371us         4.21%      59.711us       9.952us       9.504us         7.70%       9.504us       1.584us             6  
+                                Activity Buffer Request        14.55%     206.375us        14.55%     206.375us     206.375us       1.760us         1.43%       1.760us       1.760us             1  
+                                    aten::empty_strided         2.12%      30.049us         2.12%      30.049us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.20%     229.735us        16.20%     229.735us      38.289us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.63%      65.693us         5.97%      84.623us       3.526us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.33%      18.930us         1.33%      18.930us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.86%     224.896us        15.86%     224.896us       4.685us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.729us         0.40%       5.729us       5.729us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.437ms
-Self CUDA time total: 124.066us
+Self CPU time total: 1.418ms
+Self CUDA time total: 123.393us
 
 
 
@@ -4479,27 +4479,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     921.138us       886.26%     921.138us     921.138us             1  
-                                            torch_eager        20.59%     281.307us        99.64%       1.361ms       1.361ms       0.000us         0.00%     105.280us     105.280us             1  
-                                              aten::mul        10.84%     148.087us        18.91%     258.361us      10.765us      55.487us        53.39%      55.487us       2.312us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.487us        53.39%      55.487us       2.312us            24  
-                                            aten::copy_         7.39%     100.946us        41.35%     564.842us      31.380us      32.481us        31.25%      33.825us       1.879us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.71%      24.640us       2.053us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.968us        15.36%      15.968us       1.331us            12  
-                                            aten::clone         1.54%      21.041us        35.66%     487.118us      81.186us       0.000us         0.00%       9.185us       1.531us             6  
-                                              aten::sub         2.75%      37.531us         4.47%      61.012us      10.169us       8.031us         7.73%       8.031us       1.339us             6  
-                                              aten::add         2.35%      32.112us         3.97%      54.222us       9.037us       7.937us         7.64%       7.937us       1.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us         7.54%       7.841us       1.307us             6  
-                                Activity Buffer Request        13.62%     186.046us        13.62%     186.046us     186.046us       1.344us         1.29%       1.344us       1.344us             1  
-                                    aten::empty_strided         2.20%      30.110us         2.20%      30.110us       5.018us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.76%     215.337us        15.76%     215.337us      35.890us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.18%      70.704us         6.60%      90.193us       3.758us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.43%      19.489us         1.43%      19.489us       0.812us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.99%     218.378us        15.99%     218.378us       4.550us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       4.960us         0.36%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.101us       931.86%     964.101us     964.101us             1  
+                                            torch_eager        11.58%     311.269us        99.80%       2.682ms       2.682ms       0.000us         0.00%     104.772us     104.772us             1  
+                                              aten::mul         5.74%     154.165us         9.94%     267.067us      11.128us      55.236us        53.39%      55.236us       2.301us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.236us        53.39%      55.236us       2.301us            24  
+                                            aten::copy_         4.07%     109.351us        68.30%       1.836ms     101.989us      32.287us        31.21%      33.599us       1.867us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.511us        23.69%      24.511us       2.043us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.937us        15.40%      15.937us       1.328us            12  
+                                            aten::clone         1.02%      27.532us        65.06%       1.749ms     291.482us       0.000us         0.00%       9.088us       1.515us             6  
+                                              aten::add         1.31%      35.310us         2.20%      59.141us       9.857us       7.969us         7.70%       7.969us       1.328us             6  
+                                              aten::sub         1.38%      37.131us         2.33%      62.602us      10.434us       7.968us         7.70%       7.968us       1.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         7.52%       7.776us       1.296us             6  
+                                Activity Buffer Request        53.54%       1.439ms        53.54%       1.439ms       1.439ms       1.312us         1.27%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.12%      30.190us         1.12%      30.190us       5.032us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.09%     217.335us         8.09%     217.335us      36.223us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.62%      70.291us         3.31%      88.901us       3.704us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.69%      18.610us         0.69%      18.610us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.64%     232.137us         8.64%     232.137us       4.836us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.481us         0.20%       5.481us       5.481us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.366ms
-Self CUDA time total: 103.936us
+Self CPU time total: 2.688ms
+Self CUDA time total: 103.460us
 
 
 
@@ -4509,27 +4509,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.466us       759.69%     943.466us     943.466us             1  
-                                            torch_eager        21.73%     302.071us        99.63%       1.385ms       1.385ms       0.000us         0.00%     125.950us     125.950us             1  
-                                              aten::mul        10.55%     146.657us        18.63%     259.039us      10.793us      65.378us        52.64%      65.378us       2.724us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.378us        52.64%      65.378us       2.724us            24  
-                                            aten::copy_         7.63%     106.103us        41.12%     571.631us      31.757us      39.519us        31.82%      41.278us       2.293us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      29.024us        23.37%      29.024us       2.419us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.294us        15.54%      19.294us       1.608us            12  
-                                            aten::clone         1.52%      21.080us        35.11%     488.057us      81.343us       0.000us         0.00%      12.254us       2.042us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.45%      10.495us       1.749us             6  
-                                              aten::sub         2.46%      34.153us         4.15%      57.634us       9.606us       9.727us         7.83%       9.727us       1.621us             6  
-                                              aten::add         2.41%      33.450us         4.05%      56.342us       9.390us       9.567us         7.70%       9.567us       1.595us             6  
-                                Activity Buffer Request        13.70%     190.466us        13.70%     190.466us     190.466us       1.759us         1.42%       1.759us       1.759us             1  
-                                    aten::empty_strided         2.14%      29.791us         2.14%      29.791us       4.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.29%     212.610us        15.29%     212.610us      35.435us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.88%      67.802us         6.29%      87.511us       3.646us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.42%      19.709us         1.42%      19.709us       0.821us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.91%     221.207us        15.91%     221.207us       4.608us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.080us         0.37%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.072us       780.68%     964.072us     964.072us             1  
+                                            torch_eager        11.45%     316.268us        99.81%       2.758ms       2.758ms       0.000us         0.00%     125.283us     125.283us             1  
+                                              aten::mul         5.46%     150.776us         9.46%     261.336us      10.889us      65.090us        52.71%      65.090us       2.712us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.090us        52.71%      65.090us       2.712us            24  
+                                            aten::copy_         3.85%     106.511us        68.83%       1.902ms     105.647us      39.266us        31.80%      41.058us       2.281us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.802us        23.32%      28.802us       2.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.135us        15.50%      19.135us       1.595us            12  
+                                            aten::clone         1.09%      30.231us        66.11%       1.827ms     304.441us       0.000us         0.00%      12.256us       2.043us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us         8.47%      10.464us       1.744us             6  
+                                              aten::add         1.22%      33.650us         2.08%      57.431us       9.572us       9.599us         7.77%       9.599us       1.600us             6  
+                                              aten::sub         1.35%      37.292us         2.48%      68.652us      11.442us       9.536us         7.72%       9.536us       1.589us             6  
+                                Activity Buffer Request        54.53%       1.507ms        54.53%       1.507ms       1.507ms       1.792us         1.45%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.19%      32.821us         1.19%      32.821us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.01%     221.424us         8.01%     221.424us      36.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.55%      70.592us         3.23%      89.363us       3.723us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.68%      18.771us         0.68%      18.771us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.42%     232.664us         8.42%     232.664us       4.847us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.190us         0.19%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.390ms
-Self CUDA time total: 124.191us
+Self CPU time total: 2.763ms
+Self CUDA time total: 123.491us
 
 
 
@@ -4539,27 +4539,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.497us       512.75%     909.497us     909.497us             1  
-                                            torch_eager        20.85%     278.298us        99.63%       1.330ms       1.330ms       0.000us         0.00%     180.288us     180.288us             1  
-                                              aten::mul        10.86%     144.977us        19.10%     254.920us      10.622us      94.591us        53.33%      94.591us       3.941us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.591us        53.33%      94.591us       3.941us            24  
-                                            aten::copy_         7.76%     103.603us        40.90%     545.870us      30.326us      57.919us        32.65%      60.831us       3.380us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.767us        22.98%      40.767us       3.397us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.866us        14.02%      24.866us       2.072us            12  
-                                            aten::clone         1.59%      21.200us        34.96%     466.526us      77.754us       0.000us         0.00%      20.064us       3.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.152us         9.67%      17.152us       2.859us             6  
-                                              aten::sub         2.64%      35.242us         4.38%      58.452us       9.742us      12.450us         7.02%      12.450us       2.075us             6  
-                                              aten::add         2.38%      31.821us         4.13%      55.081us       9.180us      12.416us         7.00%      12.416us       2.069us             6  
-                                Activity Buffer Request        12.93%     172.606us        12.93%     172.606us     172.606us       2.912us         1.64%       2.912us       2.912us             1  
-                                    aten::empty_strided         2.27%      30.341us         2.27%      30.341us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.64%     208.798us        15.64%     208.798us      34.800us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.99%      66.616us         6.40%      85.475us       3.561us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.41%      18.859us         1.41%      18.859us       0.786us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.28%     217.276us        16.28%     217.276us       4.527us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.001us         0.37%       5.001us       5.001us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     934.855us       527.63%     934.855us     934.855us             1  
+                                            torch_eager        19.51%     283.728us        99.66%       1.450ms       1.450ms       0.000us         0.00%     180.061us     180.061us             1  
+                                              aten::mul        10.43%     151.748us        18.10%     263.338us      10.972us      95.007us        53.62%      95.007us       3.959us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.007us        53.62%      95.007us       3.959us            24  
+                                            aten::copy_         7.11%     103.461us        44.35%     645.065us      35.837us      57.664us        32.55%      60.544us       3.364us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.608us        22.92%      40.608us       3.384us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.510us        13.83%      24.510us       2.042us            12  
+                                            aten::clone         1.46%      21.280us        38.39%     558.424us      93.071us       0.000us         0.00%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us         9.63%      17.056us       2.843us             6  
+                                              aten::add         2.36%      34.271us         3.99%      58.001us       9.667us      12.287us         6.93%      12.287us       2.048us             6  
+                                              aten::sub         2.55%      37.161us         4.24%      61.641us      10.274us      12.223us         6.90%      12.223us       2.037us             6  
+                                Activity Buffer Request        17.53%     255.006us        17.53%     255.006us     255.006us       2.880us         1.63%       2.880us       2.880us             1  
+                                    aten::empty_strided         2.02%      29.311us         2.02%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.21%     221.267us        15.21%     221.267us      36.878us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.73%      68.750us         6.01%      87.372us       3.641us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.28%      18.622us         1.28%      18.622us       0.776us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.48%     225.131us        15.48%     225.131us       4.690us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.880us         0.34%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.335ms
-Self CUDA time total: 177.376us
+Self CPU time total: 1.455ms
+Self CUDA time total: 177.181us
 
 
 
@@ -4569,27 +4569,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     908.914us       305.78%     908.914us     908.914us             1  
-                                            torch_eager        20.55%     283.527us        99.64%       1.375ms       1.375ms       0.000us         0.00%     314.296us     314.296us             1  
-                                              aten::mul        10.61%     146.340us        18.54%     255.803us      10.658us     145.086us        48.81%     145.086us       6.045us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.086us        48.81%     145.086us       6.045us            24  
-                                            aten::copy_         7.34%     101.324us        42.67%     588.790us      32.711us     111.099us        37.38%     128.154us       7.120us            18  
-                                            aten::clone         1.50%      20.722us        37.09%     511.699us      85.283us       0.000us         0.00%      70.718us      11.786us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.436us        19.32%      57.436us       4.786us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.663us        18.05%      53.663us       8.944us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.056us        13.81%      41.056us       3.421us            12  
-                                              aten::sub         2.49%      34.330us         4.16%      57.351us       9.558us      20.672us         6.95%      20.672us       3.445us             6  
-                                              aten::add         2.29%      31.611us         3.89%      53.723us       8.954us      20.384us         6.86%      20.384us       3.397us             6  
-                                Activity Buffer Request        15.84%     218.487us        15.84%     218.487us     218.487us      17.055us         5.74%      17.055us      17.055us             1  
-                                    aten::empty_strided         2.18%      30.110us         2.18%      30.110us       5.018us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.10%     208.357us        15.10%     208.357us      34.726us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.74%      65.442us         6.15%      84.803us       3.533us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.40%      19.361us         1.40%      19.361us       0.807us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.60%     215.218us        15.60%     215.218us       4.484us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       4.930us         0.36%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     936.902us       314.34%     936.902us     936.902us             1  
+                                            torch_eager        19.95%     279.505us        99.63%       1.396ms       1.396ms       0.000us         0.00%     315.267us     315.267us             1  
+                                              aten::mul        10.85%     152.079us        18.94%     265.395us      11.058us     146.176us        49.04%     146.176us       6.091us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.176us        49.04%     146.176us       6.091us            24  
+                                            aten::copy_         7.66%     107.385us        42.60%     596.937us      33.163us     110.978us        37.23%     128.194us       7.122us            18  
+                                            aten::clone         1.45%      20.319us        36.31%     508.783us      84.797us       0.000us         0.00%      70.625us      11.771us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.569us        19.32%      57.569us       4.797us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.409us        17.92%      53.409us       8.902us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.897us        13.72%      40.897us       3.408us            12  
+                                              aten::sub         2.61%      36.531us         4.38%      61.402us      10.234us      20.449us         6.86%      20.449us       3.408us             6  
+                                              aten::add         2.39%      33.533us         3.98%      55.753us       9.292us      20.448us         6.86%      20.448us       3.408us             6  
+                                Activity Buffer Request        14.75%     206.705us        14.75%     206.705us     206.705us      17.216us         5.78%      17.216us      17.216us             1  
+                                    aten::empty_strided         2.13%      29.842us         2.13%      29.842us       4.974us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.44%     216.385us        15.44%     216.385us      36.064us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.91%      68.874us         6.21%      87.042us       3.627us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.30%      18.168us         1.30%      18.168us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.19%     226.869us        16.19%     226.869us       4.726us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.161us         0.37%       5.161us       5.161us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.380ms
-Self CUDA time total: 297.241us
+Self CPU time total: 1.401ms
+Self CUDA time total: 298.051us
 
 
 
@@ -4599,27 +4599,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.162us       529.48%     939.162us     939.162us             1  
-                                            torch_eager        11.57%     307.472us        99.80%       2.653ms       2.653ms       0.000us         0.00%     180.256us     180.256us             1  
-                                              aten::mul         5.55%     147.649us         9.66%     256.649us      10.694us      94.851us        53.47%      94.851us       3.952us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.851us        53.47%      94.851us       3.952us            24  
-                                            aten::copy_         3.85%     102.292us        68.52%       1.821ms     101.186us      57.759us        32.56%      60.639us       3.369us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.671us        22.93%      40.671us       3.389us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.766us        13.96%      24.766us       2.064us            12  
-                                            aten::clone         1.06%      28.080us        65.81%       1.749ms     291.547us       0.000us         0.00%      19.968us       3.328us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us         9.63%      17.088us       2.848us             6  
-                                              aten::add         1.13%      30.133us         1.96%      52.053us       8.675us      12.384us         6.98%      12.384us       2.064us             6  
-                                              aten::sub         1.27%      33.752us         2.15%      57.162us       9.527us      12.382us         6.98%      12.382us       2.064us             6  
-                                Activity Buffer Request        54.50%       1.449ms        54.50%       1.449ms       1.449ms       2.880us         1.62%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.13%      30.142us         1.13%      30.142us       5.024us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.84%     208.428us         7.84%     208.428us      34.738us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.02%      80.309us         3.76%      99.911us       4.163us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.74%      19.602us         0.74%      19.602us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.14%     216.293us         8.14%     216.293us       4.506us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.200us         0.20%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     953.069us       538.57%     953.069us     953.069us             1  
+                                            torch_eager        19.36%     280.983us        99.62%       1.446ms       1.446ms       0.000us         0.00%     179.812us     179.812us             1  
+                                              aten::mul        10.74%     155.876us        18.65%     270.688us      11.279us      94.916us        53.64%      94.916us       3.955us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.916us        53.64%      94.916us       3.955us            24  
+                                            aten::copy_         7.70%     111.823us        43.62%     633.117us      35.173us      57.568us        32.53%      60.416us       3.356us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.544us        22.91%      40.544us       3.379us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.480us        13.83%      24.480us       2.040us            12  
+                                            aten::clone         1.50%      21.731us        37.58%     545.384us      90.897us       0.000us         0.00%      19.872us       3.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.62%      17.024us       2.837us             6  
+                                              aten::add         2.38%      34.509us         4.05%      58.781us       9.797us      12.256us         6.93%      12.256us       2.043us             6  
+                                              aten::sub         2.51%      36.442us         4.13%      59.923us       9.987us      12.224us         6.91%      12.224us       2.037us             6  
+                                Activity Buffer Request        15.40%     223.485us        15.40%     223.485us     223.485us       2.848us         1.61%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.13%      30.930us         2.13%      30.930us       5.155us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.79%     229.197us        15.79%     229.197us      38.200us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.88%      70.882us         6.18%      89.652us       3.735us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      18.770us         1.29%      18.770us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.93%     231.177us        15.93%     231.177us       4.816us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.510us         0.38%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.658ms
-Self CUDA time total: 177.376us
+Self CPU time total: 1.451ms
+Self CUDA time total: 176.964us
 
 
 
@@ -4629,27 +4629,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     942.515us       317.36%     942.515us     942.515us             1  
-                                            torch_eager        20.57%     285.923us        99.62%       1.385ms       1.385ms       0.000us         0.00%     314.717us     314.717us             1  
-                                              aten::mul        10.73%     149.116us        18.62%     258.870us      10.786us     145.439us        48.97%     145.439us       6.060us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.439us        48.97%     145.439us       6.060us            24  
-                                            aten::copy_         7.46%     103.659us        42.33%     588.488us      32.694us     110.749us        37.29%     128.477us       7.138us            18  
-                                            aten::clone         1.56%      21.753us        36.61%     508.959us      84.826us       0.000us         0.00%      71.104us      11.851us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.373us        19.32%      57.373us       4.781us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.376us        17.97%      53.376us       8.896us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.801us        13.74%      40.801us       3.400us            12  
-                                              aten::sub         2.38%      33.081us         4.03%      56.021us       9.337us      20.449us         6.89%      20.449us       3.408us             6  
-                                              aten::add         2.40%      33.331us         4.05%      56.271us       9.379us      20.352us         6.85%      20.352us       3.392us             6  
-                                Activity Buffer Request        14.18%     197.118us        14.18%     197.118us     197.118us      17.728us         5.97%      17.728us      17.728us             1  
-                                    aten::empty_strided         2.21%      30.780us         2.21%      30.780us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.19%     225.018us        16.19%     225.018us      37.503us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.87%      67.722us         6.24%      86.713us       3.613us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.37%      18.991us         1.37%      18.991us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.71%     218.327us        15.71%     218.327us       4.548us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.310us         0.38%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     992.756us       332.77%     992.756us     992.756us             1  
+                                            torch_eager        20.12%     289.006us        99.66%       1.432ms       1.432ms       0.000us         0.00%     316.222us     316.222us             1  
+                                              aten::mul        11.31%     162.528us        19.47%     279.759us      11.657us     146.880us        49.23%     146.880us       6.120us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.880us        49.23%     146.880us       6.120us            24  
+                                            aten::copy_         7.73%     111.012us        41.48%     595.895us      33.105us     110.942us        37.19%     128.830us       7.157us            18  
+                                            aten::clone         1.55%      22.310us        35.21%     505.793us      84.299us       0.000us         0.00%      71.424us      11.904us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.406us        19.24%      57.406us       4.784us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.536us        17.94%      53.536us       8.923us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.512us        13.58%      40.512us       3.376us            12  
+                                              aten::add         2.53%      36.289us         4.25%      61.011us      10.169us      20.352us         6.82%      20.352us       3.392us             6  
+                                              aten::sub         2.59%      37.162us         4.41%      63.291us      10.549us      20.160us         6.76%      20.160us       3.360us             6  
+                                Activity Buffer Request        13.10%     188.164us        13.10%     188.164us     188.164us      17.888us         6.00%      17.888us      17.888us             1  
+                                    aten::empty_strided         2.24%      32.121us         2.24%      32.121us       5.354us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.74%     226.067us        15.74%     226.067us      37.678us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.81%      69.111us         6.15%      88.363us       3.682us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.34%      19.252us         1.34%      19.252us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.62%     238.734us        16.62%     238.734us       4.974us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.940us         0.34%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.390ms
-Self CUDA time total: 296.989us
+Self CPU time total: 1.437ms
+Self CUDA time total: 298.334us
 
 
 
@@ -4659,27 +4659,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     928.214us       158.30%     928.214us     928.214us             1  
-                                            torch_eager        21.21%     285.194us        99.61%       1.340ms       1.340ms       0.000us         0.00%     610.012us     610.012us             1  
-                                            aten::copy_         7.59%     102.047us        40.19%     540.521us      30.029us     268.445us        45.78%     292.093us      16.227us            18  
-                                              aten::mul        11.07%     148.860us        19.42%     261.184us      10.883us     251.679us        42.92%     251.679us      10.487us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.679us        42.92%     251.679us      10.487us            24  
-                                            aten::clone         1.57%      21.069us        34.26%     460.696us      76.783us       0.000us         0.00%     201.406us      33.568us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.758us        30.32%     177.758us      29.626us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.687us        15.47%      90.687us       7.557us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.240us        11.30%      66.240us       5.520us            12  
-                                              aten::sub         2.72%      36.642us         4.50%      60.582us      10.097us      33.152us         5.65%      33.152us       5.525us             6  
-                                              aten::add         2.29%      30.800us         3.93%      52.901us       8.817us      33.088us         5.64%      33.088us       5.515us             6  
-                                Activity Buffer Request        12.31%     165.596us        12.31%     165.596us     165.596us      23.648us         4.03%      23.648us      23.648us             1  
-                                    aten::empty_strided         2.19%      29.501us         2.19%      29.501us       4.917us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.63%     210.266us        15.63%     210.266us      35.044us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.16%      69.374us         6.60%      88.734us       3.697us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.44%      19.360us         1.44%      19.360us       0.807us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.43%     220.977us        16.43%     220.977us       4.604us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.39%       5.180us         0.39%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     957.657us       163.29%     957.657us     957.657us             1  
+                                            torch_eager        20.09%     288.813us        99.63%       1.432ms       1.432ms       0.000us         0.00%     610.425us     610.425us             1  
+                                            aten::copy_         7.31%     105.011us        42.63%     612.724us      34.040us     268.572us        45.79%     292.508us      16.250us            18  
+                                              aten::mul        10.71%     153.870us        18.84%     270.776us      11.282us     252.607us        43.07%     252.607us      10.525us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     252.607us        43.07%     252.607us      10.525us            24  
+                                            aten::clone         1.42%      20.480us        36.58%     525.692us      87.615us       0.000us         0.00%     201.566us      33.594us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.630us        30.29%     177.630us      29.605us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.942us        15.51%      90.942us       7.578us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.310us        11.14%      65.310us       5.443us            12  
+                                              aten::sub         2.69%      38.720us         4.45%      63.991us      10.665us      32.991us         5.63%      32.991us       5.499us             6  
+                                              aten::add         2.37%      34.041us         3.93%      56.461us       9.410us      32.319us         5.51%      32.319us       5.387us             6  
+                                Activity Buffer Request        15.99%     229.866us        15.99%     229.866us     229.866us      23.936us         4.08%      23.936us      23.936us             1  
+                                    aten::empty_strided         2.02%      29.010us         2.02%      29.010us       4.835us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.72%     211.585us        14.72%     211.585us      35.264us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.83%      69.478us         6.24%      89.671us       3.736us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.40%      20.193us         1.40%      20.193us       0.841us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.06%     230.859us        16.06%     230.859us       4.810us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.320us         0.37%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.345ms
-Self CUDA time total: 586.364us
+Self CPU time total: 1.437ms
+Self CUDA time total: 586.489us
 
 
 
@@ -4689,35 +4689,35 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         9.32%     323.657us        76.63%       2.662ms       2.662ms       0.000us         0.00%       1.834ms       1.834ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.806ms       102.11%       1.806ms       1.806ms             1  
-                                            aten::copy_         3.12%     108.276us        52.46%       1.822ms     101.225us     791.134us        44.74%     857.278us      47.627us            18  
-                                              aten::mul         4.16%     144.572us         7.37%     256.109us      10.671us     827.198us        46.78%     827.198us      34.467us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     827.198us        46.78%     827.198us      34.467us            24  
-                                            aten::clone         0.81%      28.142us        50.15%       1.742ms     290.300us       0.000us         0.00%     624.095us     104.016us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     557.951us        31.55%     557.951us      92.992us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     233.183us        13.19%     233.183us      19.432us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     149.919us         8.48%     149.919us      12.493us            12  
-                                              aten::sub         0.98%      34.102us         1.65%      57.362us       9.560us      90.368us         5.11%      90.368us      15.061us             6  
-                                Activity Buffer Request        41.53%       1.443ms        41.53%       1.443ms       1.443ms      66.144us         3.74%      66.144us      66.144us             1  
-                                              aten::add         0.89%      30.740us         1.53%      53.293us       8.882us      59.551us         3.37%      59.551us       9.925us             6  
-                                    aten::empty_strided         0.86%      29.871us         0.86%      29.871us       4.979us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         5.94%     206.426us         5.94%     206.426us      34.404us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.06%      71.442us         2.62%      91.034us       3.793us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.56%      19.592us         0.56%      19.592us       0.816us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         6.40%     222.192us         6.40%     222.192us       4.629us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        23.37%     811.698us        23.37%     811.698us     811.698us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         9.43%     329.378us        77.87%       2.720ms       2.720ms       0.000us         0.00%       1.842ms       1.842ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.815ms       102.19%       1.815ms       1.815ms             1  
+                                            aten::copy_         3.09%     107.951us        52.68%       1.840ms     102.235us     794.051us        44.71%     860.068us      47.782us            18  
+                                              aten::mul         4.59%     160.365us         8.02%     279.997us      11.667us     834.368us        46.99%     834.368us      34.765us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     834.368us        46.99%     834.368us      34.765us            24  
+                                            aten::clone         0.80%      28.034us        50.14%       1.751ms     291.882us       0.000us         0.00%     627.394us     104.566us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     561.377us        31.61%     561.377us      93.563us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.674us        13.10%     232.674us      19.389us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.392us         8.30%     147.392us      12.283us            12  
+                                              aten::sub         1.14%      39.970us         1.89%      66.170us      11.028us      89.952us         5.07%      89.952us      14.992us             6  
+                                Activity Buffer Request        41.31%       1.443ms        41.31%       1.443ms       1.443ms      66.017us         3.72%      66.017us      66.017us             1  
+                                              aten::add         0.95%      33.281us         1.61%      56.271us       9.379us      57.440us         3.23%      57.440us       9.573us             6  
+                                    aten::empty_strided         0.85%      29.670us         0.85%      29.670us       4.945us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.22%     217.146us         6.22%     217.146us      36.191us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.01%      70.292us         2.58%      90.182us       3.758us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.57%      19.890us         0.57%      19.890us       0.829us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.90%     240.975us         6.90%     240.975us       5.020us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        22.13%     773.090us        22.13%     773.090us     773.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.473ms
-Self CUDA time total: 1.768ms
+Self CPU time total: 3.493ms
+Self CUDA time total: 1.776ms
 
 
 impl                     wl                  p50(ms)  ok
 torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
 torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
 torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
 torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
 torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
@@ -4735,7 +4735,7 @@ torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
 torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
 torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
 
diff --git a/rotary/index.html b/rotary/index.html index cb1be8e4d680b5623caf2d05c1be684b075964b4..5ff503336b04c290f15ed24958b96a45568efad3 100644 --- a/rotary/index.html +++ b/rotary/index.html @@ -1,89 +1,3879 @@ - + - - - Index of /rotary - + + + index + + + + + -
- ← back -
-

Index of /rotary

- +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Rotary Position Embeddings Benchmarks

+

This directory contains benchmarks for Rotary Position Embeddings (RoPE) implementations.

+

Implementations

+ +

Results

+ +
+ \ No newline at end of file diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg index 793d43c4ad9f51efa85fd8e3504aaff6f6bbc3ad..3fdefb46544d73b9bc85fc2ae3e00add87b86535 100644 --- a/rotary/results/artifacts/combine/latency.svg +++ b/rotary/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0517a426384d0bc9df1932ace04595ea1867cb036e7fbeced61eb044cff2e335 +oid sha256:36e71e631ab1a00097df3bc72a4532b4b383ed31a1df2368bd041e765254a9c3 size 31018 diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html index a1cdc49d1fc3934c88244cb81845c6ffb97c9784..17475d0e65452d0f310ef38d60c5c80c88e6833b 100644 --- a/rotary/results/combined_results.html +++ b/rotary/results/combined_results.html @@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-28T14:09:08.848427 + 2025-10-29T14:27:54.393501 image/svg+xml @@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 - + - + - 0.6 + 0.6 @@ -4287,34 +4287,34 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + @@ -4364,7 +4364,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.36s +Cell: combine | 4.35s | Raw @@ -4453,7 +4453,7 @@ COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 False +hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False @@ -4478,8 +4478,8 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False torch_eager cuda_B1_S128_H32_D128_R64 0.22 True torch_eager cuda_B1_S128_H32_D64_R32 0.22 True torch_eager cuda_B1_S128_H8_D128_R64 0.23 True -torch_eager cuda_B1_S128_H8_D64_R32 0.17 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True +torch_eager cuda_B1_S128_H8_D64_R32 0.18 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True @@ -4497,7 +4497,7 @@ torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True torch_eager cuda_B2_S512_H32_D128_R64 0.22 True torch_eager cuda_B2_S512_H32_D64_R32 0.22 True -torch_eager cuda_B2_S512_H8_D128_R64 0.22 True +torch_eager cuda_B2_S512_H8_D128_R64 0.21 True torch_eager cuda_B2_S512_H8_D64_R32 0.22 True GENERATING COMBINED VISUALIZATION @@ -4518,7 +4518,7 @@ Implementations included:
▶ UV Install Logs
@@ -4531,7 +4531,7 @@ Installed 37 packages in 219ms - 2025-10-28T14:09:08.848427 + 2025-10-29T14:27:54.393501 image/svg+xml @@ -4875,70 +4875,70 @@ Installed 37 packages in 219ms - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 - + - + - 0.6 + 0.6 @@ -4946,34 +4946,34 @@ Installed 37 packages in 219ms - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + +