diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl index 93cac36bf4f689de57400a82e22b49cf0344ff7b..3e006de96bb2daedf135e24fcb717a0479a9b199 100644 --- a/activation/impls/artifacts/benchmark/activation.jsonl +++ b/activation/impls/artifacts/benchmark/activation.jsonl @@ -1,9 +1,9 @@ -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02251099999739381, "p50": 0.02324100000805629, "p90": 0.023539999972399528, "mean": 0.023146399996676337, "iqr": 0.0007499999696847226, "raw_times": [0.023539999972399528, 0.022790000002714805, 0.02324100000805629, 0.02365000000281725, 0.02251099999739381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029810000000907166, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027370999987397227, "p50": 0.028240999995432503, "p90": 0.028329999963716546, "mean": 0.02825879998908931, "iqr": 0.00023899997358967084, "raw_times": [0.028090999990126875, 0.028240999995432503, 0.028329999963716546, 0.029261000008773408, 0.027370999987397227], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03212000001440174, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02641099996480989, "p50": 0.027520999992702855, "p90": 0.028440999983558868, "mean": 0.027734599996165343, "iqr": 0.001440999938040477, "raw_times": [0.02641099996480989, 0.028440999983558868, 0.029299999994236714, 0.027520999992702855, 0.02700000004551839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.032080999972095015, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026321000007101247, "p50": 0.02724099999795726, "p90": 0.028659999998126295, "mean": 0.02923079999845868, "iqr": 0.0014990000067882647, "raw_times": [0.026321000007101247, 0.03677099999777056, 0.028659999998126295, 0.02716099999133803, 0.02724099999795726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031121000006351096, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025979999975334067, "p50": 0.028520999990178098, "p90": 0.028720999978304462, "mean": 0.027810800008865044, "iqr": 0.00169999992749581, "raw_times": [0.025979999975334067, 0.028811000049699942, 0.028520999990178098, 0.027021000050808652, 0.028720999978304462], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02976100000751103, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026359999992564553, "p50": 0.027051000017763727, "p90": 0.027101000000584463, "mean": 0.027004599996871548, "iqr": 0.00035100003970001126, "raw_times": [0.027101000000584463, 0.027051000017763727, 0.027761000012560544, 0.026749999960884452, 0.026359999992564553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029620999953294813, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02576100001761006, "p50": 0.027530000011211087, "p90": 0.02828099997032041, "mean": 0.0273743999969156, "iqr": 0.001340999972399004, "raw_times": [0.02576100001761006, 0.02828099997032041, 0.026939999997921404, 0.02835999998751504, 0.027530000011211087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030121000008875853, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025459999960730784, "p50": 0.028590999988864496, "p90": 0.02870100001928222, "mean": 0.027812799999082927, "iqr": 0.00113999999484804, "raw_times": [0.025459999960730784, 0.02870100001928222, 0.028751000002102955, 0.02756100002443418, 0.028590999988864496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02632999996876606, "p50": 0.027500999976837193, "p90": 0.028640000039104052, "mean": 0.028318399995441723, "iqr": 0.0021100000253682083, "raw_times": [0.02632999996876606, 0.03259099997876547, 0.027500999976837193, 0.026530000013735844, 0.028640000039104052], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029991000019435887, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.040432000048440386, "p50": 0.04165099994679622, "p90": 0.0417410000181917, "mean": 0.04172699999571705, "iqr": 0.0011400000516914588, "raw_times": [0.0417410000181917, 0.04420999999865671, 0.040432000048440386, 0.04165099994679622, 0.04060099996650024], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046430999987023824, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04963099996757592, "p50": 0.05265099997586731, "p90": 0.053851000018312334, "mean": 0.054568999985349365, "iqr": 0.0016500000583619112, "raw_times": [0.04963099996757592, 0.05265099997586731, 0.05220099995995042, 0.053851000018312334, 0.06451100000504084], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05472100002634761, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04966099999137441, "p50": 0.05102099999021448, "p90": 0.05103099999814731, "mean": 0.05151719999503257, "iqr": 0.0007099999947968172, "raw_times": [0.04966099999137441, 0.05555199999207616, 0.05032100000335049, 0.05102099999021448, 0.05103099999814731], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05423200002496742, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04886099998202553, "p50": 0.05024199998615586, "p90": 0.0503609999782384, "mean": 0.05005519998348973, "iqr": 0.0007900000014160469, "raw_times": [0.04886099998202553, 0.04957099997682235, 0.051240999994206504, 0.05024199998615586, 0.0503609999782384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053871000034177996, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04914099997677113, "p50": 0.04985100002841136, "p90": 0.05049099996767836, "mean": 0.04988699998875745, "iqr": 0.0013399999829744047, "raw_times": [0.04915099998470396, 0.05080099998622245, 0.04985100002841136, 0.04914099997677113, 0.05049099996767836], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053920999960155314, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656100003330721, "p50": 0.04960100000062084, "p90": 0.05333199999313365, "mean": 0.05254540001260466, "iqr": 0.0039209999727063405, "raw_times": [0.04656100003330721, 0.05333199999313365, 0.04960100000062084, 0.04941100002042731, 0.06382200001553429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051971000004868984, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04889099994898061, "p50": 0.050290999979552, "p90": 0.05037099998617123, "mean": 0.05047499996635452, "iqr": 0.0002600000357233512, "raw_times": [0.04889099994898061, 0.052710999966620875, 0.050110999950447876, 0.05037099998617123, 0.050290999979552], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05234200000359124, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0489209999727791, "p50": 0.04973099999006081, "p90": 0.05078099997035679, "mean": 0.051391199974659685, "iqr": 0.0012099999935344385, "raw_times": [0.0489209999727791, 0.05078099997035679, 0.04973099999006081, 0.04957099997682235, 0.05795199996327938], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0512020000087432, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04852099999652637, "p50": 0.04917100000056962, "p90": 0.049370999988695985, "mean": 0.049055200008751854, "iqr": 0.0007299999538190605, "raw_times": [0.04852099999652637, 0.048641000034876924, 0.04917100000056962, 0.049370999988695985, 0.04957200002309037], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05309099998385136, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py index 04f9df27c14acf429b58dba6cf0677c00cbbbced..711af9e01652ef5081b507affd0f7df9ac99e644 100644 --- a/activation/impls/cells/benchmark.py +++ b/activation/impls/cells/benchmark.py @@ -4,7 +4,6 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", -# "kernels", # ] # # [tool.uv.sources] @@ -13,22 +12,17 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -from kernels import get_kernel +import torch, torch.nn.functional as F -# Load the activation kernel -activation = get_kernel("kernels-community/activation") - -def hf_kernels_swiglu(input_tensor): - hidden_dim = input_tensor.shape[-1] // 2 - out_shape = input_tensor.shape[:-1] + (hidden_dim,) - out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device) - return activation.silu_and_mul(out, input_tensor) +def swiglu_eager(x): + d = x.shape[-1] // 2 + return F.silu(x[..., :d]) * x[..., d:] run_benchmark( kernel_type=KernelTypeEnum.ACTIVATION, - impl_name="hf_kernels_swiglu", - impl_tags={"family": "hf-kernels", "backend": "cuda"}, - impl_func=hf_kernels_swiglu, + impl_name="torch_eager", + impl_tags={"family":"hf-kernels", "backend":"eager"}, + impl_func=swiglu_eager, ) \ No newline at end of file diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html index 0ee10cb621cd4a8fa09e449aade63a5a1449d022..8384c1a7290b7dc38496729333c6a6825ebff89b 100644 --- a/activation/impls/hf_kernels_swiglu.html +++ b/activation/impls/hf_kernels_swiglu.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.26s +Cell: nv | 0.21s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
-
Wed Oct 29 14:26:44 2025       
+
Wed Oct 29 15:50:40 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0            133W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   28C    P0             78W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3928,7 @@ Cell: nv | 0.26s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 4.19s
+Cell: benchmark | 7.78s
  | 
 
 Raw
@@ -3976,17 +3984,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      72.288us      1807.20%      72.288us      72.288us             1  
-                                      hf_kernels_swiglu        12.07%     211.387us        99.59%       1.744ms       1.744ms       0.000us         0.00%       5.376us       5.376us             1  
-                      _activation_beeaae6::silu_and_mul         1.10%      19.319us        84.87%       1.486ms     495.368us       4.000us       100.00%       5.376us       1.792us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.000us       100.00%       4.000us       1.333us             3  
-                                Activity Buffer Request        81.49%       1.427ms        81.49%       1.427ms       1.427ms       1.376us        34.40%       1.376us       1.376us             1  
-                                            aten::empty         2.64%      46.231us         2.64%      46.231us      15.410us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.28%      39.911us         2.28%      39.911us      13.304us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.41%       7.220us         0.41%       7.220us       7.220us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      79.968us      1983.33%      79.968us      79.968us             1  
+                                      hf_kernels_swiglu        10.58%     184.424us        99.57%       1.736ms       1.736ms       0.000us         0.00%       5.408us       5.408us             1  
+                      _activation_beeaae6::silu_and_mul         1.26%      21.900us        86.25%       1.504ms     501.188us       4.032us       100.00%       5.408us       1.803us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
+                                Activity Buffer Request        82.49%       1.438ms        82.49%       1.438ms       1.438ms       1.376us        34.13%       1.376us       1.376us             1  
+                                            aten::empty         2.74%      47.772us         2.74%      47.772us      15.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.50%      43.631us         2.50%      43.631us      14.544us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.43%       7.440us         0.43%       7.440us       7.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.751ms
-Self CUDA time total: 4.000us
+Self CPU time total: 1.743ms
+Self CUDA time total: 4.032us
 
 
 
@@ -3996,16 +4004,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.686us      1579.79%      62.686us      62.686us             1  
-                                      hf_kernels_swiglu         6.72%     108.943us        99.67%       1.616ms       1.616ms       0.000us         0.00%       5.312us       5.312us             1  
-                      _activation_beeaae6::silu_and_mul         1.34%      21.721us        91.77%       1.488ms     495.875us       3.968us       100.00%       5.312us       1.771us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      60.192us      1516.94%      60.192us      60.192us             1  
+                                      hf_kernels_swiglu         5.66%      89.803us        99.62%       1.581ms       1.581ms       0.000us         0.00%       5.312us       5.312us             1  
+                      _activation_beeaae6::silu_and_mul         1.35%      21.470us        92.79%       1.473ms     491.035us       3.968us       100.00%       5.312us       1.771us             3  
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
-                                Activity Buffer Request        88.82%       1.440ms        88.82%       1.440ms       1.440ms       1.344us        33.87%       1.344us       1.344us             1  
-                                            aten::empty         1.18%      19.150us         1.18%      19.150us       6.383us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.61%      26.150us         1.61%      26.150us       8.717us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.33%       5.310us         0.33%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        89.86%       1.427ms        89.86%       1.427ms       1.427ms       1.344us        33.87%       1.344us       1.344us             1  
+                                            aten::empty         1.17%      18.590us         1.17%      18.590us       6.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.58%      25.022us         1.58%      25.022us       8.341us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.38%       6.110us         0.38%       6.110us       6.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.621ms
+Self CPU time total: 1.588ms
 Self CUDA time total: 3.968us
 
 
@@ -4016,17 +4024,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.687us      1361.79%      66.687us      66.687us             1  
-                                      hf_kernels_swiglu         6.74%     109.943us        99.70%       1.626ms       1.626ms       0.000us         0.00%       6.529us       6.529us             1  
-                      _activation_beeaae6::silu_and_mul         1.25%      20.459us        91.78%       1.496ms     498.816us       4.897us       100.00%       6.529us       2.176us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.897us       100.00%       4.897us       1.632us             3  
-                                Activity Buffer Request        88.91%       1.450ms        88.91%       1.450ms       1.450ms       1.632us        33.33%       1.632us       1.632us             1  
-                                            aten::empty         1.18%      19.260us         1.18%      19.260us       6.420us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.61%      26.232us         1.61%      26.232us       8.744us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       4.870us         0.30%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.535us      1338.54%      65.535us      65.535us             1  
+                                      hf_kernels_swiglu         5.56%      88.483us        99.64%       1.586ms       1.586ms       0.000us         0.00%       6.528us       6.528us             1  
+                      _activation_beeaae6::silu_and_mul         1.35%      21.452us        92.87%       1.478ms     492.822us       4.896us       100.00%       6.528us       2.176us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.896us       100.00%       4.896us       1.632us             3  
+                                Activity Buffer Request        89.90%       1.431ms        89.90%       1.431ms       1.431ms       1.632us        33.33%       1.632us       1.632us             1  
+                                            aten::empty         1.21%      19.310us         1.21%      19.310us       6.437us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.63%      25.910us         1.63%      25.910us       8.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.36%       5.661us         0.36%       5.661us       5.661us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.631ms
-Self CUDA time total: 4.897us
+Self CPU time total: 1.592ms
+Self CUDA time total: 4.896us
 
 
 
@@ -4036,17 +4044,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.081us      1552.66%      66.081us      66.081us             1  
-                                      hf_kernels_swiglu         6.15%     108.423us        99.71%       1.758ms       1.758ms       0.000us         0.00%       5.696us       5.696us             1  
-                      _activation_beeaae6::silu_and_mul         1.25%      22.001us        92.49%       1.631ms     543.697us       4.256us       100.00%       5.696us       1.899us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.256us       100.00%       4.256us       1.419us             3  
-                                Activity Buffer Request        80.93%       1.427ms        80.93%       1.427ms       1.427ms       1.440us        33.83%       1.440us       1.440us             1  
-                                            aten::empty         1.07%      18.910us         1.07%      18.910us       6.303us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        10.31%     181.874us        10.31%     181.874us      60.625us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.110us         0.29%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.008us      1562.69%      67.008us      67.008us             1  
+                                      hf_kernels_swiglu         4.93%      90.832us        99.72%       1.836ms       1.836ms       0.000us         0.00%       5.728us       5.728us             1  
+                      _activation_beeaae6::silu_and_mul         1.23%      22.581us        93.74%       1.726ms     575.177us       4.288us       100.00%       5.728us       1.909us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.288us       100.00%       4.288us       1.429us             3  
+                                Activity Buffer Request        81.40%       1.498ms        81.40%       1.498ms       1.498ms       1.440us        33.58%       1.440us       1.440us             1  
+                                            aten::empty         1.04%      19.180us         1.04%      19.180us       6.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        11.11%     204.595us        11.11%     204.595us      68.198us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.180us         0.28%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.764ms
-Self CUDA time total: 4.256us
+Self CPU time total: 1.841ms
+Self CUDA time total: 4.288us
 
 
 
@@ -4056,17 +4064,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.167us      1072.63%      63.167us      63.167us             1  
-                                      hf_kernels_swiglu        15.22%      87.332us        99.19%     569.294us     569.294us       0.000us         0.00%       7.873us       7.873us             1  
-                      _activation_beeaae6::silu_and_mul         3.58%      20.570us        80.67%     463.002us     154.334us       5.889us       100.00%       7.873us       2.624us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.889us       100.00%       5.889us       1.963us             3  
-                                Activity Buffer Request        48.76%     279.877us        48.76%     279.877us     279.877us       1.984us        33.69%       1.984us       1.984us             1  
-                                            aten::empty         3.30%      18.960us         3.30%      18.960us       6.320us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        28.32%     162.555us        28.32%     162.555us      54.185us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.81%       4.660us         0.81%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      64.800us      1106.37%      64.800us      64.800us             1  
+                                      hf_kernels_swiglu         5.65%      97.973us        99.69%       1.728ms       1.728ms       0.000us         0.00%       7.810us       7.810us             1  
+                      _activation_beeaae6::silu_and_mul         1.27%      22.090us        92.96%       1.611ms     536.996us       5.857us       100.00%       7.810us       2.603us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.857us       100.00%       5.857us       1.952us             3  
+                                Activity Buffer Request        82.37%       1.427ms        82.37%       1.427ms       1.427ms       1.953us        33.34%       1.953us       1.953us             1  
+                                            aten::empty         1.09%      18.810us         1.09%      18.810us       6.270us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.31%     161.434us         9.31%     161.434us      53.811us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.300us         0.31%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 573.954us
-Self CUDA time total: 5.889us
+Self CPU time total: 1.733ms
+Self CUDA time total: 5.857us
 
 
 
@@ -4076,17 +4084,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.632us       906.67%      69.632us      69.632us             1  
-                                      hf_kernels_swiglu         6.07%     107.484us        99.73%       1.766ms       1.766ms       0.000us         0.00%      10.240us      10.240us             1  
-                      _activation_beeaae6::silu_and_mul         1.19%      21.010us        92.55%       1.639ms     546.413us       7.680us       100.00%      10.240us       3.413us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.680us       100.00%       7.680us       2.560us             3  
-                                Activity Buffer Request        81.69%       1.447ms        81.69%       1.447ms       1.447ms       2.560us        33.33%       2.560us       2.560us             1  
-                                            aten::empty         1.11%      19.720us         1.11%      19.720us       6.573us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.67%     171.234us         9.67%     171.234us      57.078us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.800us         0.27%       4.800us       4.800us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      77.311us      1002.48%      77.311us      77.311us             1  
+                                      hf_kernels_swiglu        20.04%      98.272us        98.88%     484.972us     484.972us       0.000us         0.00%      10.304us      10.304us             1  
+                      _activation_beeaae6::silu_and_mul         4.97%      24.390us        74.66%     366.210us     122.070us       7.712us       100.00%      10.304us       3.435us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us       100.00%       7.712us       2.571us             3  
+                                Activity Buffer Request        34.13%     167.415us        34.13%     167.415us     167.415us       2.592us        33.61%       2.592us       2.592us             1  
+                                            aten::empty         4.18%      20.490us         4.18%      20.490us       6.830us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.56%     174.405us        35.56%     174.405us      58.135us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.12%       5.511us         1.12%       5.511us       5.511us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.771ms
-Self CUDA time total: 7.680us
+Self CPU time total: 490.483us
+Self CUDA time total: 7.712us
 
 
 
@@ -4096,16 +4104,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      72.064us      1098.54%      72.064us      72.064us             1  
-                                      hf_kernels_swiglu         6.19%     109.521us        99.72%       1.763ms       1.763ms       0.000us         0.00%       8.768us       8.768us             1  
-                      _activation_beeaae6::silu_and_mul         1.22%      21.580us        92.43%       1.635ms     544.850us       6.560us       100.00%       8.768us       2.923us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.327us       965.35%      63.327us      63.327us             1  
+                                      hf_kernels_swiglu        20.14%      83.823us        98.84%     411.400us     411.400us       0.000us         0.00%       8.768us       8.768us             1  
+                      _activation_beeaae6::silu_and_mul         5.43%      22.601us        74.29%     309.187us     103.062us       6.560us       100.00%       8.768us       2.923us             3  
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us       100.00%       6.560us       2.187us             3  
-                                Activity Buffer Request        81.92%       1.449ms        81.92%       1.449ms       1.449ms       2.208us        33.66%       2.208us       2.208us             1  
-                                            aten::empty         1.09%      19.351us         1.09%      19.351us       6.450us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.29%     164.205us         9.29%     164.205us      54.735us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       4.990us         0.28%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        32.27%     134.313us        32.27%     134.313us     134.313us       2.208us        33.66%       2.208us       2.208us             1  
+                                            aten::empty         4.42%      18.390us         4.42%      18.390us       6.130us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        36.59%     152.273us        36.59%     152.273us      50.758us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.16%       4.810us         1.16%       4.810us       4.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.768ms
+Self CPU time total: 416.210us
 Self CUDA time total: 6.560us
 
 
@@ -4116,16 +4124,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.118us       692.16%      65.118us      65.118us             1  
-                                      hf_kernels_swiglu        16.62%      89.683us        99.03%     534.374us     534.374us       0.000us         0.00%      12.576us      12.576us             1  
-                      _activation_beeaae6::silu_and_mul         3.96%      21.372us        78.99%     426.201us     142.067us       9.408us       100.00%      12.576us       4.192us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.952us       743.54%      69.952us      69.952us             1  
+                                      hf_kernels_swiglu         5.37%      93.270us        99.70%       1.733ms       1.733ms       0.000us         0.00%      12.544us      12.544us             1  
+                      _activation_beeaae6::silu_and_mul         1.28%      22.251us        93.17%       1.619ms     539.830us       9.408us       100.00%      12.544us       4.181us             3  
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.408us       100.00%       9.408us       3.136us             3  
-                                Activity Buffer Request        44.61%     240.735us        44.61%     240.735us     240.735us       3.168us        33.67%       3.168us       3.168us             1  
-                                            aten::empty         3.43%      18.490us         3.43%      18.490us       6.163us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.41%     164.094us        30.41%     164.094us      54.698us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.97%       5.210us         0.97%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        83.02%       1.443ms        83.02%       1.443ms       1.443ms       3.136us        33.33%       3.136us       3.136us             1  
+                                            aten::empty         1.17%      20.271us         1.17%      20.271us       6.757us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.87%     154.165us         8.87%     154.165us      51.388us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.210us         0.30%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 539.584us
+Self CPU time total: 1.738ms
 Self CUDA time total: 9.408us
 
 
@@ -4136,17 +4144,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.182us       527.34%      69.182us      69.182us             1  
-                                      hf_kernels_swiglu        12.86%     103.214us        99.41%     797.800us     797.800us       0.000us         0.00%      17.534us      17.534us             1  
-                      _activation_beeaae6::silu_and_mul         2.63%      21.139us        84.20%     675.726us     225.242us      13.119us       100.00%      17.534us       5.845us             3  
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.119us       100.00%      13.119us       4.373us             3  
-                                Activity Buffer Request        61.21%     491.232us        61.21%     491.232us     491.232us       4.415us        33.65%       4.415us       4.415us             1  
-                                            aten::empty         2.35%      18.860us         2.35%      18.860us       6.287us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        20.35%     163.355us        20.35%     163.355us      54.452us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.59%       4.750us         0.59%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.278us       502.45%      65.278us      65.278us             1  
+                                      hf_kernels_swiglu        20.56%      86.143us        98.78%     413.910us     413.910us       0.000us         0.00%      17.344us      17.344us             1  
+                      _activation_beeaae6::silu_and_mul         5.61%      23.493us        73.70%     308.818us     102.939us      12.992us       100.00%      17.344us       5.781us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      12.992us       100.00%      12.992us       4.331us             3  
+                                Activity Buffer Request        31.64%     132.592us        31.64%     132.592us     132.592us       4.352us        33.50%       4.352us       4.352us             1  
+                                            aten::empty         4.52%      18.949us         4.52%      18.949us       6.316us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        36.45%     152.733us        36.45%     152.733us      50.911us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.22%       5.130us         1.22%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 802.550us
-Self CUDA time total: 13.119us
+Self CPU time total: 419.040us
+Self CUDA time total: 12.992us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4163,12 +4171,12 @@ hf_kernels_swiglu        cuda_T512_D768         0.03  True
 
▶ UV Install Logs
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] -Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.29it/s] -Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 19.98it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 17.75it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 24.82it/s]

Artifacts:

activation.jsonl diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html index 6e53efa4229f749d46be9ca846a20dfeed1ecd5d..cb77a7f7040f9e7b74d60407bd43687879e4b072 100644 --- a/activation/impls/torch_swiglu.html +++ b/activation/impls/torch_swiglu.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.26s +Cell: nv | 0.21s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
-
Wed Oct 29 14:26:44 2025       
+
Wed Oct 29 15:50:40 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0            133W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   28C    P0             78W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3918,9 +3926,9 @@ Cell: nv | 0.26s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 6.86s
+Cell: benchmark | 3.39s
  | 
 
 Raw
@@ -3970,19 +3978,19 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     189.470us      1483.94%     189.470us     189.470us             1  
-                                            torch_eager        11.64%     220.727us        99.60%       1.889ms       1.889ms       0.000us         0.00%      15.103us      15.103us             1  
-                                             aten::silu         3.36%      63.732us        81.84%       1.552ms     517.326us       6.559us        51.37%       8.894us       2.965us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.559us        51.37%       6.559us       2.186us             3  
-                                              aten::mul         1.83%      34.608us         3.05%      57.780us      19.260us       6.209us        48.63%       6.209us       2.070us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.209us        48.63%       6.209us       2.070us             3  
-                                Activity Buffer Request        76.17%       1.444ms        76.17%       1.444ms       1.444ms       2.335us        18.29%       2.335us       2.335us             1  
-                                            aten::slice         2.47%      46.790us         3.07%      58.281us       9.714us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.61%      11.491us         0.61%      11.491us       1.915us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.54%      67.043us         3.54%      67.043us      11.174us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.40%       7.531us         0.40%       7.531us       7.531us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     198.560us      1555.14%     198.560us     198.560us             1  
+                                            torch_eager        10.82%     202.394us        99.60%       1.864ms       1.864ms       0.000us         0.00%      15.104us      15.104us             1  
+                                             aten::silu         3.05%      57.001us        82.79%       1.549ms     516.356us       6.560us        51.38%       8.896us       2.965us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.38%       6.560us       2.187us             3  
+                                              aten::mul         1.85%      34.663us         3.11%      58.253us      19.418us       6.208us        48.62%       6.208us       2.069us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        48.62%       6.208us       2.069us             3  
+                                Activity Buffer Request        77.33%       1.447ms        77.33%       1.447ms       1.447ms       2.336us        18.30%       2.336us       2.336us             1  
+                                            aten::slice         2.27%      42.481us         2.88%      53.841us       8.973us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.61%      11.360us         0.61%      11.360us       1.893us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.67%      68.681us         3.67%      68.681us      11.447us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.40%       7.560us         0.40%       7.560us       7.560us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.896ms
+Self CPU time total: 1.871ms
 Self CUDA time total: 12.768us
 
 
@@ -3993,20 +4001,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.895us      1299.43%     160.895us     160.895us             1  
-                                            torch_eager         6.82%     117.243us        99.71%       1.713ms       1.713ms       0.000us         0.00%      14.558us      14.558us             1  
-                                             aten::silu         2.46%      42.340us        88.23%       1.516ms     505.362us       6.399us        51.68%       8.575us       2.858us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.399us        51.68%       6.399us       2.133us             3  
-                                              aten::mul         1.64%      28.101us         2.83%      48.681us      16.227us       5.983us        48.32%       5.983us       1.994us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.983us        48.32%       5.983us       1.994us             3  
-                                Activity Buffer Request        84.10%       1.445ms        84.10%       1.445ms       1.445ms       2.176us        17.57%       2.176us       2.176us             1  
-                                            aten::slice         1.47%      25.252us         1.82%      31.222us       5.204us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.35%       5.970us         0.35%       5.970us       0.995us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.87%      49.290us         2.87%      49.290us       8.215us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.29%       5.020us         0.29%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     153.854us      1245.68%     153.854us     153.854us             1  
+                                            torch_eager         7.83%     135.935us        99.65%       1.729ms       1.729ms       0.000us         0.00%      14.495us      14.495us             1  
+                                             aten::silu         2.47%      42.821us        87.44%       1.517ms     505.699us       6.399us        51.81%       8.543us       2.848us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.399us        51.81%       6.399us       2.133us             3  
+                                              aten::mul         1.58%      27.360us         2.69%      46.680us      15.560us       5.952us        48.19%       5.952us       1.984us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us        48.19%       5.952us       1.984us             3  
+                                Activity Buffer Request        83.34%       1.446ms        83.34%       1.446ms       1.446ms       2.144us        17.36%       2.144us       2.144us             1  
+                                            aten::slice         1.38%      23.991us         1.69%      29.361us       4.893us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.31%       5.370us         0.31%       5.370us       0.895us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.74%      47.550us         2.74%      47.550us       7.925us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       6.041us         0.35%       6.041us       6.041us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.718ms
-Self CUDA time total: 12.382us
+Self CPU time total: 1.735ms
+Self CUDA time total: 12.351us
 
 
 
@@ -4016,20 +4024,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.982us      1195.38%     157.982us     157.982us             1  
-                                            torch_eager         6.51%     110.244us        99.65%       1.686ms       1.686ms       0.000us         0.00%      15.488us      15.488us             1  
-                                             aten::silu         2.52%      42.653us        88.50%       1.498ms     499.192us       6.784us        51.33%       9.056us       3.019us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.33%       6.784us       2.261us             3  
-                                              aten::mul         1.66%      28.021us         2.76%      46.791us      15.597us       6.432us        48.67%       6.432us       2.144us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.432us        48.67%       6.432us       2.144us             3  
-                                Activity Buffer Request        84.30%       1.427ms        84.30%       1.427ms       1.427ms       2.272us        17.19%       2.272us       2.272us             1  
-                                            aten::slice         1.51%      25.627us         1.87%      31.700us       5.283us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.36%       6.073us         0.36%       6.073us       1.012us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.78%      47.050us         2.78%      47.050us       7.842us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.35%       5.950us         0.35%       5.950us       5.950us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     152.990us      1157.70%     152.990us     152.990us             1  
+                                            torch_eager         7.93%     136.944us        99.69%       1.722ms       1.722ms       0.000us         0.00%      15.487us      15.487us             1  
+                                             aten::silu         2.43%      41.922us        87.32%       1.508ms     502.829us       6.752us        51.09%       9.024us       3.008us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us        51.09%       6.752us       2.251us             3  
+                                              aten::mul         1.55%      26.841us         2.71%      46.791us      15.597us       6.463us        48.91%       6.463us       2.154us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.463us        48.91%       6.463us       2.154us             3  
+                                Activity Buffer Request        83.33%       1.439ms        83.33%       1.439ms       1.439ms       2.272us        17.19%       2.272us       2.272us             1  
+                                            aten::slice         1.41%      24.420us         1.74%      29.990us       4.998us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.32%       5.570us         0.32%       5.570us       0.928us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.72%      47.030us         2.72%      47.030us       7.838us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.290us         0.31%       5.290us       5.290us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.692ms
-Self CUDA time total: 13.216us
+Self CPU time total: 1.728ms
+Self CUDA time total: 13.215us
 
 
 
@@ -4039,20 +4047,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     159.902us      1258.67%     159.902us     159.902us             1  
-                                            torch_eager         6.73%     114.317us        99.66%       1.694ms       1.694ms       0.000us         0.00%      14.912us      14.912us             1  
-                                             aten::silu         2.46%      41.881us        88.34%       1.501ms     500.465us       6.560us        51.64%       8.768us       2.923us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.64%       6.560us       2.187us             3  
-                                              aten::mul         1.68%      28.581us         2.79%      47.441us      15.814us       6.144us        48.36%       6.144us       2.048us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.144us        48.36%       6.144us       2.048us             3  
-                                Activity Buffer Request        74.33%       1.263ms        74.33%       1.263ms       1.263ms       2.208us        17.38%       2.208us       2.208us             1  
-                                            aten::slice         1.44%      24.468us         1.80%      30.638us       5.106us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.36%       6.170us         0.36%       6.170us       1.028us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.65%     214.994us        12.65%     214.994us      35.832us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.34%       5.830us         0.34%       5.830us       5.830us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     152.287us      1195.72%     152.287us     152.287us             1  
+                                            torch_eager         6.75%     128.682us        99.76%       1.902ms       1.902ms       0.000us         0.00%      14.944us      14.944us             1  
+                                             aten::silu         2.22%      42.301us        89.12%       1.699ms     566.261us       6.560us        51.51%       8.768us       2.923us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.51%       6.560us       2.187us             3  
+                                              aten::mul         1.34%      25.502us         2.28%      43.392us      14.464us       6.176us        48.49%       6.176us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        48.49%       6.176us       2.059us             3  
+                                Activity Buffer Request        74.83%       1.427ms        74.83%       1.427ms       1.427ms       2.208us        17.34%       2.208us       2.208us             1  
+                                            aten::slice         1.32%      25.141us         1.61%      30.781us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.30%       5.640us         0.30%       5.640us       0.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.00%     247.856us        13.00%     247.856us      41.309us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.611us         0.24%       4.611us       4.611us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.700ms
-Self CUDA time total: 12.704us
+Self CPU time total: 1.906ms
+Self CUDA time total: 12.736us
 
 
 
@@ -4062,20 +4070,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.053us      1185.48%     157.053us     157.053us             1  
-                                            torch_eager         6.08%     111.294us        99.69%       1.824ms       1.824ms       0.000us         0.00%      15.552us      15.552us             1  
-                                             aten::silu         2.39%      43.729us        89.42%       1.636ms     545.306us       6.784us        51.21%       9.088us       3.029us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.21%       6.784us       2.261us             3  
-                                              aten::mul         1.44%      26.361us         2.52%      46.181us      15.394us       6.464us        48.79%       6.464us       2.155us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.464us        48.79%       6.464us       2.155us             3  
-                                Activity Buffer Request        77.97%       1.426ms        77.97%       1.426ms       1.426ms       2.304us        17.39%       2.304us       2.304us             1  
-                                            aten::slice         1.34%      24.571us         1.66%      30.441us       5.074us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.32%       5.870us         0.32%       5.870us       0.978us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.14%     185.544us        10.14%     185.544us      30.924us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.31%       5.601us         0.31%       5.601us       5.601us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     153.054us      1155.39%     153.054us     153.054us             1  
+                                            torch_eager         6.42%     122.793us        99.75%       1.907ms       1.907ms       0.000us         0.00%      15.518us      15.518us             1  
+                                             aten::silu         2.19%      41.952us        89.33%       1.708ms     569.191us       6.751us        50.96%       9.022us       3.007us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us        50.96%       6.751us       2.250us             3  
+                                              aten::mul         1.27%      24.330us         2.36%      45.101us      15.034us       6.496us        49.04%       6.496us       2.165us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.496us        49.04%       6.496us       2.165us             3  
+                                Activity Buffer Request        76.06%       1.454ms        76.06%       1.454ms       1.454ms       2.271us        17.14%       2.271us       2.271us             1  
+                                            aten::slice         1.34%      25.570us         1.64%      31.330us       5.222us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.30%       5.760us         0.30%       5.760us       0.960us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.16%     232.387us        12.16%     232.387us      38.731us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.25%       4.840us         0.25%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.829ms
-Self CUDA time total: 13.248us
+Self CPU time total: 1.912ms
+Self CUDA time total: 13.247us
 
 
 
@@ -4085,20 +4093,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.390us       977.47%     151.390us     151.390us             1  
-                                            torch_eager        22.03%     109.975us        99.02%     494.363us     494.363us       0.000us         0.00%      18.176us      18.176us             1  
-                                             aten::silu         8.41%      41.971us        61.88%     308.937us     102.979us       7.936us        51.24%      10.624us       3.541us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.24%       7.936us       2.645us             3  
-                                              aten::mul         5.23%      26.101us         8.92%      44.531us      14.844us       7.552us        48.76%       7.552us       2.517us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.76%       7.552us       2.517us             3  
-                                Activity Buffer Request        22.19%     110.773us        22.19%     110.773us     110.773us       2.688us        17.36%       2.688us       2.688us             1  
-                                            aten::slice         5.05%      25.220us         6.19%      30.920us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.14%       5.700us         1.14%       5.700us       0.950us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        34.98%     174.623us        34.98%     174.623us      29.104us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.98%       4.900us         0.98%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     159.743us      1029.27%     159.743us     159.743us             1  
+                                            torch_eager         7.04%     135.613us        99.74%       1.921ms       1.921ms       0.000us         0.00%      18.208us      18.208us             1  
+                                             aten::silu         2.22%      42.702us        88.66%       1.708ms     569.181us       7.936us        51.13%      10.624us       3.541us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.13%       7.936us       2.645us             3  
+                                              aten::mul         1.46%      28.181us         2.39%      45.941us      15.314us       7.584us        48.87%       7.584us       2.528us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.584us        48.87%       7.584us       2.528us             3  
+                                Activity Buffer Request        75.65%       1.457ms        75.65%       1.457ms       1.457ms       2.688us        17.32%       2.688us       2.688us             1  
+                                            aten::slice         1.35%      26.081us         1.66%      31.951us       5.325us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.30%       5.870us         0.30%       5.870us       0.978us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.71%     225.495us        11.71%     225.495us      37.582us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       4.960us         0.26%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 499.263us
-Self CUDA time total: 15.488us
+Self CPU time total: 1.926ms
+Self CUDA time total: 15.520us
 
 
 
@@ -4108,20 +4116,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     163.583us      1143.70%     163.583us     163.583us             1  
-                                            torch_eager         6.28%     116.052us        99.70%       1.841ms       1.841ms       0.000us         0.00%      16.767us      16.767us             1  
-                                             aten::silu         2.27%      41.942us        89.09%       1.645ms     548.450us       7.327us        51.23%       9.791us       3.264us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.327us        51.23%       7.327us       2.442us             3  
-                                              aten::mul         1.55%      28.681us         2.62%      48.392us      16.131us       6.976us        48.77%       6.976us       2.325us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.976us        48.77%       6.976us       2.325us             3  
-                                Activity Buffer Request        78.22%       1.445ms        78.22%       1.445ms       1.445ms       2.464us        17.23%       2.464us       2.464us             1  
-                                            aten::slice         1.38%      25.430us         1.70%      31.392us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.32%       5.962us         0.32%       5.962us       0.994us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.67%     178.614us         9.67%     178.614us      29.769us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.30%       5.570us         0.30%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     156.031us      1088.46%     156.031us     156.031us             1  
+                                            torch_eager         6.78%     127.672us        99.74%       1.878ms       1.878ms       0.000us         0.00%      16.798us      16.798us             1  
+                                             aten::silu         2.24%      42.252us        88.75%       1.671ms     556.944us       7.327us        51.11%       9.790us       3.263us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.327us        51.11%       7.327us       2.442us             3  
+                                              aten::mul         1.40%      26.401us         2.46%      46.222us      15.407us       7.008us        48.89%       7.008us       2.336us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.008us        48.89%       7.008us       2.336us             3  
+                                Activity Buffer Request        75.83%       1.428ms        75.83%       1.428ms       1.428ms       2.463us        17.18%       2.463us       2.463us             1  
+                                            aten::slice         1.43%      26.941us         1.75%      32.941us       5.490us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.32%       6.000us         0.32%       6.000us       1.000us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.73%     220.885us        11.73%     220.885us      36.814us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       4.871us         0.26%       4.871us       4.871us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.847ms
-Self CUDA time total: 14.303us
+Self CPU time total: 1.883ms
+Self CUDA time total: 14.335us
 
 
 
@@ -4131,20 +4139,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     150.172us       969.60%     150.172us     150.172us             1  
-                                            torch_eager        23.07%     110.204us        98.98%     472.752us     472.752us       0.000us         0.00%      18.176us      18.176us             1  
-                                             aten::silu         9.08%      43.371us        60.20%     287.547us      95.849us       7.936us        51.24%      10.624us       3.541us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.24%       7.936us       2.645us             3  
-                                              aten::mul         5.48%      26.181us         9.38%      44.801us      14.934us       7.552us        48.76%       7.552us       2.517us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.76%       7.552us       2.517us             3  
-                                Activity Buffer Request        19.26%      92.002us        19.26%      92.002us      92.002us       2.688us        17.36%       2.688us       2.688us             1  
-                                            aten::slice         5.00%      23.870us         6.32%      30.200us       5.033us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.33%       6.330us         1.33%       6.330us       1.055us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        35.76%     170.794us        35.76%     170.794us      28.466us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         1.02%       4.871us         1.02%       4.871us       4.871us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.072us       971.40%     151.072us     151.072us             1  
+                                            torch_eager         5.82%     108.433us        99.72%       1.859ms       1.859ms       0.000us         0.00%      18.240us      18.240us             1  
+                                             aten::silu         2.20%      40.971us        89.83%       1.675ms     558.344us       7.968us        51.23%      10.656us       3.552us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        51.23%       7.968us       2.656us             3  
+                                              aten::mul         1.42%      26.501us         2.46%      45.902us      15.301us       7.584us        48.77%       7.584us       2.528us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.584us        48.77%       7.584us       2.528us             3  
+                                Activity Buffer Request        76.88%       1.433ms        76.88%       1.433ms       1.433ms       2.688us        17.28%       2.688us       2.688us             1  
+                                            aten::slice         1.31%      24.441us         1.61%      29.960us       4.993us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.30%       5.519us         0.30%       5.519us       0.920us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.80%     219.996us        11.80%     219.996us      36.666us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.300us         0.28%       5.300us       5.300us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 477.623us
-Self CUDA time total: 15.488us
+Self CPU time total: 1.865ms
+Self CUDA time total: 15.552us
 
 
 
@@ -4154,20 +4162,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.000us       713.30%     160.000us     160.000us             1  
-                                            torch_eager         5.99%     109.975us        99.73%       1.831ms       1.831ms       0.000us         0.00%      26.335us      26.335us             1  
-                                             aten::silu         2.30%      42.230us        89.52%       1.643ms     547.763us      11.583us        51.64%      15.487us       5.162us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.583us        51.64%      11.583us       3.861us             3  
-                                              aten::mul         1.54%      28.250us         2.52%      46.180us      15.393us      10.848us        48.36%      10.848us       3.616us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.848us        48.36%      10.848us       3.616us             3  
-                                Activity Buffer Request        78.83%       1.447ms        78.83%       1.447ms       1.447ms       3.904us        17.40%       3.904us       3.904us             1  
-                                            aten::slice         1.37%      25.211us         1.70%      31.261us       5.210us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.33%       6.050us         0.33%       6.050us       1.008us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.37%     171.964us         9.37%     171.964us      28.661us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       4.930us         0.27%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.150us       692.69%     157.150us     157.150us             1  
+                                            torch_eager         5.73%     107.203us        99.74%       1.865ms       1.865ms       0.000us         0.00%      26.622us      26.622us             1  
+                                             aten::silu         2.21%      41.231us        89.87%       1.680ms     560.117us      11.647us        51.34%      15.582us       5.194us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.647us        51.34%      11.647us       3.882us             3  
+                                              aten::mul         1.38%      25.882us         2.47%      46.192us      15.397us      11.040us        48.66%      11.040us       3.680us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.040us        48.66%      11.040us       3.680us             3  
+                                Activity Buffer Request        77.17%       1.443ms        77.17%       1.443ms       1.443ms       3.935us        17.34%       3.935us       3.935us             1  
+                                            aten::slice         1.37%      25.600us         1.67%      31.160us       5.193us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.30%       5.560us         0.30%       5.560us       0.927us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.58%     216.535us        11.58%     216.535us      36.089us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       4.830us         0.26%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.836ms
-Self CUDA time total: 22.431us
+Self CPU time total: 1.870ms
+Self CUDA time total: 22.687us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4181,12 +4189,6 @@ torch_eager              cuda_T512_D1024        0.05  True
 torch_eager              cuda_T512_D2048        0.05  True
 torch_eager              cuda_T512_D768         0.05  True
 
-
-
▶ UV Install Logs
- -

Artifacts:

activation.jsonl diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg index b809b51f58837145ae3fdbcb04aa1aec4a5e023e..d52b2c0b2f83fd998cb1b5431865b3b7d258b0df 100644 --- a/activation/results/artifacts/combine/latency.svg +++ b/activation/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f62c7d85fc4a76cf7a1060a62df99ff0d32133ab94bb502b68dcd53171c39602 -size 21424 +oid sha256:29b9e8bb5a372481457939e6eee0f747e53209886137e7247a5b8d98423c5492 +size 20645 diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html index 35064093e9085dbed21e2edd8a0a4e6c497bbb9d..a833d69bc0c276816c01310e2597b3e819179c89 100644 --- a/activation/results/combined_results.html +++ b/activation/results/combined_results.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-29T14:27:49.999657 + 2025-10-29T15:51:13.643076 image/svg+xml @@ -4021,96 +4029,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 0.025 + 0.025 - + - + - 0.030 + 0.030 - + - + - 0.035 + 0.035 - + - + - 0.040 + 0.040 - + - + - 0.045 + 0.045 - + - + - 0.050 - - - - - - - - - - - - - 0.055 + 0.050 @@ -4118,37 +4113,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - - - - - - - + + + + + + + + - + - + - - - - - - - + + + + + + + @@ -4163,14 +4158,14 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + Attention Implementation Latency - + @@ -4179,7 +4174,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: hf_kernels_swiglu - + @@ -4206,7 +4201,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.24s +Cell: combine | 4.26s | Raw @@ -4345,7 +4340,7 @@ Installed 37 packages in 218ms - 2025-10-29T14:27:49.999657 + 2025-10-29T15:51:13.643076 image/svg+xml @@ -4494,96 +4489,83 @@ Installed 37 packages in 218ms - + - + - 0.025 + 0.025 - + - + - 0.030 + 0.030 - + - + - 0.035 + 0.035 - + - + - 0.040 + 0.040 - + - + - 0.045 + 0.045 - + - + - 0.050 - - - - - - - - - - - - - 0.055 + 0.050 @@ -4591,37 +4573,37 @@ Installed 37 packages in 218ms - + - - - - - - - - + + + + + + + + - + - + - - - - - - - + + + + + + + @@ -4636,14 +4618,14 @@ Installed 37 packages in 218ms - + Attention Implementation Latency - + @@ -4652,7 +4634,7 @@ Installed 37 packages in 218ms hf_kernels_swiglu - + diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl index 3c3e9cb1937f70bc8a6005f64424ae1ae23f373f..94aedaebb02d5899bc4806cc445e41e3cbec9de2 100644 --- a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl @@ -1,24 +1,24 @@ -{"ts": "2025-10-29T14:27:35Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047051000024112, "p50": 0.048391000007086404, "p90": 0.048571999968771706, "mean": 0.04890720000503279, "iqr": 0.0005509999709829572, "raw_times": [0.0525010000274051, 0.048571999968771706, 0.04802099999778875, 0.048391000007086404, 0.047051000024112], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058991999992485944, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05480199996554802, "p50": 0.05610099998420992, "p90": 0.05628099995647062, "mean": 0.056069199968078465, "iqr": 0.0006299999881775875, "raw_times": [0.057510999965870724, 0.05628099995647062, 0.05610099998420992, 0.055650999968293036, 0.05480199996554802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06025200002568454, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05469199999197372, "p50": 0.056971999981669796, "p90": 0.057361000017408514, "mean": 0.056363600003805914, "iqr": 0.0025490000439276628, "raw_times": [0.05469199999197372, 0.057361000017408514, 0.056971999981669796, 0.05798100005449669, 0.05481199997348085], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0596809999819925, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05416100003685642, "p50": 0.05502099998011545, "p90": 0.05511200004093553, "mean": 0.05489540000098714, "iqr": 0.00016000007008187822, "raw_times": [0.05416100003685642, 0.05495199997085365, 0.055230999976174644, 0.05511200004093553, 0.05502099998011545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05659100003185813, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052401000004920206, "p50": 0.055000999964249786, "p90": 0.056301000029179704, "mean": 0.054810999995424936, "iqr": 0.0023400000372930663, "raw_times": [0.052401000004920206, 0.056301000029179704, 0.056390999986888346, 0.05396099999188664, 0.055000999964249786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05838100003074942, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05270100001553146, "p50": 0.05342100001826111, "p90": 0.054111999986616865, "mean": 0.053611199996339565, "iqr": 0.0008709999974598759, "raw_times": [0.05324099998915699, 0.054580999972131394, 0.054111999986616865, 0.05342100001826111, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0580610000042725, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051810999991630524, "p50": 0.05310099999178419, "p90": 0.053301000036753976, "mean": 0.05577720000928821, "iqr": 0.0007700000423938036, "raw_times": [0.051810999991630524, 0.05253099999436017, 0.06814200003191218, 0.053301000036753976, 0.05310099999178419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056801999960498506, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053270999956112064, "p50": 0.05397199998924407, "p90": 0.05399100001568513, "mean": 0.054061200000887766, "iqr": 0.00023000001192485797, "raw_times": [0.05399100001568513, 0.05531100003963729, 0.053270999956112064, 0.05397199998924407, 0.05376100000376027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056641000014678866, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052231000040592335, "p50": 0.052561000018158666, "p90": 0.0526110000009794, "mean": 0.0529970000116009, "iqr": 0.0002200000039920269, "raw_times": [0.052231000040592335, 0.052390999996987375, 0.05519100000128674, 0.052561000018158666, 0.0526110000009794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05621100001462764, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052561000018158666, "p50": 0.053772000001117704, "p90": 0.05471100001841478, "mean": 0.057879400003457704, "iqr": 0.0021000000174353772, "raw_times": [0.0526110000009794, 0.05471100001841478, 0.053772000001117704, 0.052561000018158666, 0.07574199997861797], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06976200000963217, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05308099997591853, "p90": 0.05349200000637211, "mean": 0.05272739998645193, "iqr": 0.0014310000437944836, "raw_times": [0.052060999962577625, 0.05366199997069998, 0.05308099997591853, 0.051341000016691396, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05661099999088037, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05081099999415528, "p50": 0.053202000003693684, "p90": 0.05362099994954406, "mean": 0.05282339998302632, "iqr": 0.0011499999459374521, "raw_times": [0.05081099999415528, 0.052471000003606605, 0.05362099994954406, 0.054011999964131974, 0.053202000003693684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05642200000011144, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052221000032659504, "p50": 0.05397099999981947, "p90": 0.05448100000648992, "mean": 0.05380100001275423, "iqr": 0.001479999980347202, "raw_times": [0.052221000032659504, 0.05300100002614272, 0.055330999998659536, 0.05448100000648992, 0.05397099999981947], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421999995061924, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164199995988383, "p50": 0.053460999993149017, "p90": 0.05421100001967716, "mean": 0.0532791999944493, "iqr": 0.00139000002263856, "raw_times": [0.0528209999970386, 0.054261000002497894, 0.05421100001967716, 0.05164199995988383, 0.053460999993149017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05741199998965385, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052460999995673774, "p50": 0.0557109999590466, "p90": 0.05585100001326282, "mean": 0.05600519999688913, "iqr": 0.002330000029360235, "raw_times": [0.052460999995673774, 0.0557109999590466, 0.05585100001326282, 0.053520999983902584, 0.06248200003255988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421000005637325, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0540510000064387, "p50": 0.0541219999945497, "p90": 0.05425200004083308, "mean": 0.054337400013082515, "iqr": 0.0001910000264615519, "raw_times": [0.0540510000064387, 0.05425200004083308, 0.05520100000921957, 0.0541219999945497, 0.05406100001437153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05772200000819794, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05302099998516496, "p50": 0.05408099997339377, "p90": 0.0544409999747586, "mean": 0.05571119997966889, "iqr": 0.0008299999763039523, "raw_times": [0.05302099998516496, 0.0544409999747586, 0.06340199996657248, 0.053610999998454645, 0.05408099997339377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05787100002407897, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052391999986411975, "p50": 0.05292200000894809, "p90": 0.053131000015582686, "mean": 0.05318180001268047, "iqr": 0.00026899999738816405, "raw_times": [0.053131000015582686, 0.054602000034265075, 0.052391999986411975, 0.05286200001819452, 0.05292200000894809], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05755099999760205, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05195099998900332, "p50": 0.05479100002503401, "p90": 0.05620100000669481, "mean": 0.05852919999824735, "iqr": 0.0034000000255218765, "raw_times": [0.07690199998933167, 0.05620100000669481, 0.05479100002503401, 0.05195099998900332, 0.052800999981172936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057030999982998765, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05290100000365783, "p50": 0.05457100002104198, "p90": 0.055251000048883725, "mean": 0.055353400011881604, "iqr": 0.001779000058377278, "raw_times": [0.05290100000365783, 0.06057199999531804, 0.055251000048883725, 0.05347199999050645, 0.05457100002104198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056300999972336285, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052152000023397704, "p50": 0.05461199998535449, "p90": 0.05471200000783938, "mean": 0.05381760001910152, "iqr": 0.001900999961890193, "raw_times": [0.052152000023397704, 0.05461199998535449, 0.05480100003296684, 0.05471200000783938, 0.052811000045949186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758200001082514, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05339099999446262, "p50": 0.05463100001179555, "p90": 0.055171999974845676, "mean": 0.05451339999353877, "iqr": 0.0011309999763398082, "raw_times": [0.05404099999850587, 0.055331999988084135, 0.05339099999446262, 0.05463100001179555, 0.055171999974845676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058501000012256554, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05277100001421786, "p50": 0.053860999969401746, "p90": 0.054361000024982786, "mean": 0.053951200004576094, "iqr": 0.0007190000133050489, "raw_times": [0.05277100001421786, 0.05512100000260034, 0.05364200001167774, 0.053860999969401746, 0.054361000024982786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057851999997637904, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124100005104992, "p50": 0.053691000005073874, "p90": 0.054261999991922494, "mean": 0.05327740001348502, "iqr": 0.0014609999539061391, "raw_times": [0.05124100005104992, 0.05439199998136246, 0.054261999991922494, 0.053691000005073874, 0.052801000038016355], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05714199994599767, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07023200004141472, "p50": 0.07095199998730095, "p90": 0.07123199998204655, "mean": 0.07353400000056354, "iqr": 0.0008999999749903509, "raw_times": [0.07095199998730095, 0.08492199998499927, 0.0703320000070562, 0.07123199998204655, 0.07023200004141472], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07603100004871521, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08460200001536577, "p50": 0.08611200001951147, "p90": 0.08698200002754675, "mean": 0.08602200001632809, "iqr": 0.001740000016070553, "raw_times": [0.08460200001536577, 0.08611200001951147, 0.08698200002754675, 0.08717200000774028, 0.0852420000114762], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08820200002901402, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08334199998216718, "p50": 0.08516200000485696, "p90": 0.08565199999566175, "mean": 0.08509399999638845, "iqr": 0.0015599999869664316, "raw_times": [0.08334199998216718, 0.08722199999056102, 0.08565199999566175, 0.08516200000485696, 0.08409200000869532], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0867219999918234, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08196199996746145, "p50": 0.08375099997692814, "p90": 0.08384200003774822, "mean": 0.08337179999671207, "iqr": 0.0010800000609378912, "raw_times": [0.08276199997681033, 0.08454200002461221, 0.08384200003774822, 0.08375099997692814, 0.08196199996746145], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08716199999980745, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08276199997681033, "p50": 0.08335200004694343, "p90": 0.08474200001273857, "mean": 0.08374400000548121, "iqr": 0.0019199999883312557, "raw_times": [0.08335200004694343, 0.08474200001273857, 0.08504199996650641, 0.08276199997681033, 0.08282200002440732], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08652200000369703, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08203199996614785, "p50": 0.08333200003107777, "p90": 0.08342199998878641, "mean": 0.08316619998822716, "iqr": 0.0006700000199089118, "raw_times": [0.08203199996614785, 0.08333200003107777, 0.08342199998878641, 0.0827519999688775, 0.08429299998624629], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08916199999475793, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08283200003234015, "p50": 0.08409299999811992, "p90": 0.08469200002991784, "mean": 0.08781020001151774, "iqr": 0.001050000037139398, "raw_times": [0.08469200002991784, 0.08409299999811992, 0.10379200000443234, 0.08364199999277844, 0.08283200003234015], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08838200000127472, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08139199997003743, "p50": 0.08336199999803284, "p90": 0.08399199998621043, "mean": 0.0832759999980226, "iqr": 0.0010599999313853914, "raw_times": [0.08139199997003743, 0.08470199998100725, 0.08293200005482504, 0.08336199999803284, 0.08399199998621043], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08715199999187462, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08235199999262477, "p50": 0.08327199998348078, "p90": 0.0835210000218467, "mean": 0.08336580000332106, "iqr": 0.00033899999607456266, "raw_times": [0.08235199999262477, 0.0835210000218467, 0.08450199999288088, 0.08327199998348078, 0.08318200002577214], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08735199998000098, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08233200003360253, "p50": 0.08335199999010001, "p90": 0.08342199998878641, "mean": 0.08314600000858263, "iqr": 0.0004799999828719592, "raw_times": [0.08342199998878641, 0.08233200003360253, 0.08335199999010001, 0.08294200000591445, 0.08368200002450976], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08666200000106983, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1449639999577812, "p50": 0.14544300000807198, "p90": 0.14571399998430934, "mean": 0.14548759999115646, "iqr": 0.00032100001590151805, "raw_times": [0.14544300000807198, 0.1449639999577812, 0.14539299996840782, 0.14571399998430934, 0.14592400003721195], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14803300001631214, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16181400002324153, "p50": 0.1630739999995967, "p90": 0.16360400002213282, "mean": 0.16567200000281446, "iqr": 0.0017800000478018774, "raw_times": [0.16181400002324153, 0.17804399999477027, 0.1630739999995967, 0.16182399997433095, 0.16360400002213282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16251400001010552, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08132199997135103, "p50": 0.08263099999794576, "p90": 0.08295200001384728, "mean": 0.0824317999899904, "iqr": 0.0009100000397666008, "raw_times": [0.08132199997135103, 0.08321199999272721, 0.08204199997408068, 0.08295200001384728, 0.08263099999794576], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08486200005108913, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08171299998593895, "p50": 0.08253199996488547, "p90": 0.08321199999272721, "mean": 0.08254819997546292, "iqr": 0.001280000049064256, "raw_times": [0.08193199994366296, 0.08335199999010001, 0.08321199999272721, 0.08171299998593895, 0.08253199996488547], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08592199998247452, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08148200004143291, "p50": 0.08176199997933509, "p90": 0.08353199996236071, "mean": 0.08242180000479493, "iqr": 0.002030999951330159, "raw_times": [0.08150100001103056, 0.08148200004143291, 0.08383200002981539, 0.08176199997933509, 0.08353199996236071], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08680199999844262, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08221299998467657, "p50": 0.08294200000591445, "p90": 0.08321200004957063, "mean": 0.08299800000486357, "iqr": 0.0007910000476840651, "raw_times": [0.08242100000188657, 0.08420199998226963, 0.08321200004957063, 0.08221299998467657, 0.08294200000591445], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08663200003411475, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08301200000460085, "p50": 0.08371199999146484, "p90": 0.08385299997826223, "mean": 0.08369219999622146, "iqr": 0.0001610000026630587, "raw_times": [0.08301200000460085, 0.08371199999146484, 0.08419200003118021, 0.08385299997826223, 0.08369199997559917], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.086651999993137, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08138200001894802, "p50": 0.08318200002577214, "p90": 0.08328199999141361, "mean": 0.08309020000751843, "iqr": 0.0010899999551838846, "raw_times": [0.08219200003622973, 0.08541299996522866, 0.08318200002577214, 0.08138200001894802, 0.08328199999141361], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08645299999443523, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0822520000269833, "p50": 0.08321100000330262, "p90": 0.08357199999409204, "mean": 0.08451599999261816, "iqr": 0.0009600000225873373, "raw_times": [0.09093299996720816, 0.0822520000269833, 0.08321100000330262, 0.0826119999715047, 0.08357199999409204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08730200005402367, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08279200000060882, "p50": 0.08370200004037542, "p90": 0.08400199999414326, "mean": 0.08373800000072151, "iqr": 0.0006500000040432496, "raw_times": [0.08335199999010001, 0.08400199999414326, 0.08484199997838004, 0.08279200000060882, 0.08370200004037542], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08856199997353542, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09200200003078862, "p50": 0.09372200003099351, "p90": 0.09380200003761274, "mean": 0.09347200001457168, "iqr": 0.00012000003835055395, "raw_times": [0.09200200003078862, 0.09415199997420132, 0.09380200003761274, 0.09372200003099351, 0.09368199999926219], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09485200001790872, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.098961999981384, "p50": 0.10011200004100829, "p90": 0.10014200000796336, "mean": 0.10138220001181253, "iqr": 0.0004400000079840538, "raw_times": [0.09970199999997931, 0.098961999981384, 0.10011200004100829, 0.10014200000796336, 0.10799300002872769], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.11010200000782788, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.48627099999976053, "p50": 0.48646099997995407, "p90": 0.4873609999549444, "mean": 0.48691319999534244, "iqr": 0.00103899992609513, "raw_times": [0.48627099999976053, 0.4873609999549444, 0.4881510000132039, 0.4863220000288493, 0.48646099997995407], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48353100004305816, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:27Z", "run": "73b51cf0e9b74d0eaee1c67a266218ef", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.491851000049337, "p50": 0.49710199999708493, "p90": 0.49729199997727846, "mean": 0.49653980000812226, "iqr": 0.0012099999935344385, "raw_times": [0.496081999983744, 0.5003720000331668, 0.49729199997727846, 0.49710199999708493, 0.491851000049337], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.5018319999976484, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py index 725b12c4018e4eec05c5ddccb0c88a8eae6f150d..2e38669a505cbdf181a93e97f31ed1e67ecf4883 100644 --- a/causal_conv1d/impls/cells/benchmark.py +++ b/causal_conv1d/impls/cells/benchmark.py @@ -4,28 +4,37 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", -# "kernels", # ] # # [tool.uv.sources] # kernels-benchmark-tools = { path = "../../../../../tools", editable = true } # /// import torch +import torch.nn.functional as F import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -from kernels import get_kernel -# Load the causal conv1d kernel -causal_conv1d = get_kernel("kernels-community/causal-conv1d") +def torch_causal_conv1d(input_tensor, weight, bias): + # Convert to weight dtype for computation + x = input_tensor.to(weight.dtype) + dim = weight.shape[0] + width = weight.shape[1] + seqlen = input_tensor.shape[-1] -def hf_kernels_causal_conv1d(input_tensor, weight, bias): - return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias) + # Depthwise causal conv1d using PyTorch + out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim) + + # Truncate to original sequence length + out = out[..., :seqlen] + + # Convert back to original dtype + return out.to(input_tensor.dtype) run_benchmark( kernel_type=KernelTypeEnum.CAUSAL_CONV1D, - impl_name="hf_kernels_causal_conv1d", - impl_tags={"family": "hf-kernels", "backend": "cuda"}, - impl_func=hf_kernels_causal_conv1d, + impl_name="torch_eager", + impl_tags={"family": "pytorch", "backend": "eager"}, + impl_func=torch_causal_conv1d, ) \ No newline at end of file diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html index 025d1f7d39597f6702f2ef95b801eca2a6d706e8..7c575dae185fce03b36f53d2a34df1f9efa53dc7 100644 --- a/causal_conv1d/impls/hf_kernels_causal_conv1d.html +++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.24s +Cell: nv | 0.21s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.24s
-
Wed Oct 29 14:27:09 2025       
+
Wed Oct 29 15:50:16 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.24s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0            109W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   29C    P0             78W /  350W |       0MiB /  46068MiB |     18%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3928,7 @@ Cell: nv | 0.24s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 5.79s
+Cell: benchmark | 9.51s
  | 
 
 Raw
@@ -3973,19 +3981,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     151.393us      3724.31%     151.393us     151.393us             1  
-                               hf_kernels_causal_conv1d         8.95%     166.324us        99.62%       1.852ms       1.852ms       0.000us         0.00%       5.505us       5.505us             1  
-                                         CausalConv1dFn         6.05%     112.563us        90.67%       1.686ms     561.934us       0.000us         0.00%       5.505us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      26.172us        80.97%       1.505ms     501.826us       4.065us       100.00%       5.505us       1.835us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
-                                Activity Buffer Request        77.14%       1.434ms        77.14%       1.434ms       1.434ms       1.440us        35.42%       1.440us       1.440us             1  
-                                       aten::empty_like         1.03%      19.059us         3.64%      67.761us      22.587us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.62%      48.702us         2.62%      48.702us      16.234us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.42%      45.061us         2.42%      45.061us      15.020us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.38%       7.150us         0.38%       7.150us       7.150us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     146.174us      3568.70%     146.174us     146.174us             1  
+                               hf_kernels_causal_conv1d         8.17%     151.282us        99.60%       1.845ms       1.845ms       0.000us         0.00%       5.536us       5.536us             1  
+                                         CausalConv1dFn         5.96%     110.474us        91.44%       1.694ms     564.683us       0.000us         0.00%       5.536us       1.845us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.25%      23.111us        81.80%       1.516ms     505.182us       4.096us       100.00%       5.536us       1.845us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.096us       100.00%       4.096us       1.365us             3  
+                                Activity Buffer Request        78.05%       1.446ms        78.05%       1.446ms       1.446ms       1.440us        35.16%       1.440us       1.440us             1  
+                                       aten::empty_like         1.06%      19.700us         3.67%      68.031us      22.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.61%      48.331us         2.61%      48.331us      16.110us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.50%      46.381us         2.50%      46.381us      15.460us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.40%       7.370us         0.40%       7.370us       7.370us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.859ms
-Self CUDA time total: 4.065us
+Self CPU time total: 1.853ms
+Self CUDA time total: 4.096us
 
 
 
@@ -3995,19 +4003,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.439us      3456.32%     129.439us     129.439us             1  
-                               hf_kernels_causal_conv1d         5.79%      99.043us        99.68%       1.706ms       1.706ms       0.000us         0.00%       4.994us       4.994us             1  
-                                         CausalConv1dFn         4.71%      80.562us        93.90%       1.607ms     535.793us       0.000us         0.00%       4.994us       1.665us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      25.130us        87.50%       1.498ms     499.285us       3.745us       100.00%       4.994us       1.665us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.745us       100.00%       3.745us       1.248us             3  
-                                Activity Buffer Request        84.17%       1.441ms        84.17%       1.441ms       1.441ms       1.249us        33.35%       1.249us       1.249us             1  
-                                       aten::empty_like         0.47%       7.980us         1.69%      28.961us       9.654us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.23%      20.981us         1.23%      20.981us       6.994us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.86%      31.821us         1.86%      31.821us      10.607us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.32%       5.430us         0.32%       5.430us       5.430us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.319us      3789.89%     144.319us     144.319us             1  
+                               hf_kernels_causal_conv1d         4.94%      83.592us        99.69%       1.687ms       1.687ms       0.000us         0.00%       5.088us       5.088us             1  
+                                         CausalConv1dFn         5.57%      94.202us        94.76%       1.604ms     534.586us       0.000us         0.00%       5.088us       1.696us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.53%      25.920us        87.50%       1.481ms     493.624us       3.808us       100.00%       5.088us       1.696us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.808us       100.00%       3.808us       1.269us             3  
+                                Activity Buffer Request        84.14%       1.424ms        84.14%       1.424ms       1.424ms       1.280us        33.61%       1.280us       1.280us             1  
+                                       aten::empty_like         0.45%       7.561us         1.69%      28.682us       9.561us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.25%      21.121us         1.25%      21.121us       7.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.83%      30.901us         1.83%      30.901us      10.300us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.170us         0.31%       5.170us       5.170us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.712ms
-Self CUDA time total: 3.745us
+Self CPU time total: 1.693ms
+Self CUDA time total: 3.808us
 
 
 
@@ -4017,19 +4025,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.098us      3285.62%     124.098us     124.098us             1  
-                               hf_kernels_causal_conv1d         5.52%      95.683us        99.69%       1.728ms       1.728ms       0.000us         0.00%       5.057us       5.057us             1  
-                                         CausalConv1dFn         4.48%      77.582us        94.17%       1.632ms     544.020us       0.000us         0.00%       5.057us       1.686us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      24.830us        87.99%       1.525ms     508.322us       3.777us       100.00%       5.057us       1.686us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
-                                Activity Buffer Request        84.76%       1.469ms        84.76%       1.469ms       1.469ms       1.280us        33.89%       1.280us       1.280us             1  
-                                       aten::empty_like         0.46%       7.920us         1.70%      29.511us       9.837us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.25%      21.591us         1.25%      21.591us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.80%      31.261us         1.80%      31.261us      10.420us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.301us         0.31%       5.301us       5.301us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.942us      3149.95%     118.942us     118.942us             1  
+                               hf_kernels_causal_conv1d         4.70%      79.942us        99.69%       1.694ms       1.694ms       0.000us         0.00%       5.024us       5.024us             1  
+                                         CausalConv1dFn         4.32%      73.340us        94.98%       1.614ms     538.022us       0.000us         0.00%       5.024us       1.675us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.40%      23.852us        89.01%       1.513ms     504.182us       3.776us       100.00%       5.024us       1.675us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
+                                Activity Buffer Request        85.86%       1.459ms        85.86%       1.459ms       1.459ms       1.248us        33.05%       1.248us       1.248us             1  
+                                       aten::empty_like         0.44%       7.502us         1.66%      28.182us       9.394us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.22%      20.680us         1.22%      20.680us       6.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.75%      29.690us         1.75%      29.690us       9.897us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.340us         0.31%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.733ms
-Self CUDA time total: 3.777us
+Self CPU time total: 1.699ms
+Self CUDA time total: 3.776us
 
 
 
@@ -4039,19 +4047,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.729us      3378.36%     129.729us     129.729us             1  
-                               hf_kernels_causal_conv1d         5.03%      97.232us        99.72%       1.927ms       1.927ms       0.000us         0.00%       5.120us       5.120us             1  
-                                         CausalConv1dFn         4.11%      79.452us        94.69%       1.830ms     610.049us       0.000us         0.00%       5.120us       1.707us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.27%      24.481us        89.03%       1.721ms     573.588us       3.840us       100.00%       5.120us       1.707us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
-                                Activity Buffer Request        76.40%       1.477ms        76.40%       1.477ms       1.477ms       1.280us        33.33%       1.280us       1.280us             1  
-                                       aten::empty_like         0.41%       7.951us         1.55%      29.931us       9.977us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.14%      21.980us         1.14%      21.980us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        11.36%     219.575us        11.36%     219.575us      73.192us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.490us         0.28%       5.490us       5.490us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.814us      3251.63%     122.814us     122.814us             1  
+                               hf_kernels_causal_conv1d         4.64%      85.642us        99.73%       1.840ms       1.840ms       0.000us         0.00%       5.025us       5.025us             1  
+                                         CausalConv1dFn         3.90%      72.023us        95.09%       1.754ms     584.757us       0.000us         0.00%       5.025us       1.675us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.39%      25.651us        89.62%       1.653ms     551.112us       3.777us       100.00%       5.025us       1.675us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
+                                Activity Buffer Request        78.74%       1.453ms        78.74%       1.453ms       1.453ms       1.248us        33.04%       1.248us       1.248us             1  
+                                       aten::empty_like         0.42%       7.802us         1.57%      28.911us       9.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.14%      21.109us         1.14%      21.109us       7.036us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.48%     174.913us         9.48%     174.913us      58.304us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.000us         0.27%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.933ms
-Self CUDA time total: 3.840us
+Self CPU time total: 1.845ms
+Self CUDA time total: 3.777us
 
 
 
@@ -4061,19 +4069,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.080us      2644.30%     126.080us     126.080us             1  
-                               hf_kernels_causal_conv1d         5.18%     102.863us        99.75%       1.979ms       1.979ms       0.000us         0.00%       6.368us       6.368us             1  
-                                         CausalConv1dFn         3.95%      78.303us        94.57%       1.876ms     625.402us       0.000us         0.00%       6.368us       2.123us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      24.140us        89.14%       1.768ms     589.491us       4.768us       100.00%       6.368us       2.123us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
-                                Activity Buffer Request        79.49%       1.577ms        79.49%       1.577ms       1.577ms       1.600us        33.56%       1.600us       1.600us             1  
-                                       aten::empty_like         0.40%       7.900us         1.48%      29.430us       9.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.09%      21.530us         1.09%      21.530us       7.177us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.43%     167.184us         8.43%     167.184us      55.728us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       4.910us         0.25%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.008us      2597.30%     123.008us     123.008us             1  
+                               hf_kernels_causal_conv1d         4.59%      83.953us        99.73%       1.825ms       1.825ms       0.000us         0.00%       6.337us       6.337us             1  
+                                         CausalConv1dFn         3.99%      73.081us        95.14%       1.741ms     580.330us       0.000us         0.00%       6.337us       2.112us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.48%      27.090us        89.51%       1.638ms     546.026us       4.736us       100.00%       6.337us       2.112us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.736us       100.00%       4.736us       1.579us             3  
+                                Activity Buffer Request        78.87%       1.443ms        78.87%       1.443ms       1.443ms       1.601us        33.80%       1.601us       1.601us             1  
+                                       aten::empty_like         0.45%       8.280us         1.63%      29.831us       9.944us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      21.551us         1.18%      21.551us       7.184us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.16%     167.714us         9.16%     167.714us      55.905us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.030us         0.27%       5.030us       5.030us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.984ms
-Self CUDA time total: 4.768us
+Self CPU time total: 1.830ms
+Self CUDA time total: 4.736us
 
 
 
@@ -4083,19 +4091,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.055us      2488.80%     121.055us     121.055us             1  
-                               hf_kernels_causal_conv1d        13.09%      78.123us        99.20%     592.205us     592.205us       0.000us         0.00%       6.528us       6.528us             1  
-                                         CausalConv1dFn        13.01%      77.643us        86.11%     514.082us     171.361us       0.000us         0.00%       6.528us       2.176us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.18%      24.929us        68.36%     408.089us     136.030us       4.864us       100.00%       6.528us       2.176us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.864us       100.00%       4.864us       1.621us             3  
-                                Activity Buffer Request        36.63%     218.665us        36.63%     218.665us     218.665us       1.664us        34.21%       1.664us       1.664us             1  
-                                       aten::empty_like         1.31%       7.839us         4.75%      28.350us       9.450us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.44%      20.511us         3.44%      20.511us       6.837us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.55%     164.495us        27.55%     164.495us      54.832us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.80%       4.790us         0.80%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     113.566us      2381.84%     113.566us     113.566us             1  
+                               hf_kernels_causal_conv1d        13.06%      81.391us        99.15%     617.944us     617.944us       0.000us         0.00%       6.400us       6.400us             1  
+                                         CausalConv1dFn        11.13%      69.381us        86.09%     536.553us     178.851us       0.000us         0.00%       6.400us       2.133us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.09%      25.520us        70.57%     439.840us     146.613us       4.768us       100.00%       6.400us       2.133us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
+                                Activity Buffer Request        39.92%     248.796us        39.92%     248.796us     248.796us       1.632us        34.23%       1.632us       1.632us             1  
+                                       aten::empty_like         1.16%       7.221us         4.39%      27.332us       9.111us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.23%      20.111us         3.23%      20.111us       6.704us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.56%     165.524us        26.56%     165.524us      55.175us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       5.280us         0.85%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 596.995us
-Self CUDA time total: 4.864us
+Self CPU time total: 623.224us
+Self CUDA time total: 4.768us
 
 
 
@@ -4105,19 +4113,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.031us      1201.49%     128.031us     128.031us             1  
-                               hf_kernels_causal_conv1d         5.58%     105.873us        99.72%       1.893ms       1.893ms       0.000us         0.00%      14.208us      14.208us             1  
-                                         CausalConv1dFn         4.13%      78.341us        94.14%       1.787ms     595.748us       0.000us         0.00%      14.208us       4.736us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.45%      27.570us        88.49%       1.680ms     559.957us      10.656us       100.00%      14.208us       4.736us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us       100.00%      10.656us       3.552us             3  
-                                Activity Buffer Request        77.94%       1.480ms        77.94%       1.480ms       1.480ms       3.552us        33.33%       3.552us       3.552us             1  
-                                       aten::empty_like         0.41%       7.812us         1.53%      29.032us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.12%      21.220us         1.12%      21.220us       7.073us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.09%     172.624us         9.09%     172.624us      57.541us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       5.330us         0.28%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.383us      1119.53%     120.383us     120.383us             1  
+                               hf_kernels_causal_conv1d         4.38%      80.811us        99.69%       1.838ms       1.838ms       0.000us         0.00%      14.338us      14.338us             1  
+                                         CausalConv1dFn         3.88%      71.502us        95.31%       1.758ms     585.854us       0.000us         0.00%      14.338us       4.779us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.42%      26.240us        89.89%       1.658ms     552.523us      10.753us       100.00%      14.338us       4.779us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.753us       100.00%      10.753us       3.584us             3  
+                                Activity Buffer Request        79.47%       1.465ms        79.47%       1.465ms       1.465ms       3.585us        33.34%       3.585us       3.585us             1  
+                                       aten::empty_like         0.42%       7.711us         1.54%      28.491us       9.497us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.13%      20.780us         1.13%      20.780us       6.927us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.00%     165.884us         9.00%     165.884us      55.295us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.720us         0.31%       5.720us       5.720us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.898ms
-Self CUDA time total: 10.656us
+Self CPU time total: 1.844ms
+Self CUDA time total: 10.753us
 
 
 
@@ -4127,19 +4135,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.524us      1119.66%     122.524us     122.524us             1  
-                               hf_kernels_causal_conv1d        19.00%     100.263us        99.02%     522.563us     522.563us       0.000us         0.00%      14.623us      14.623us             1  
-                                         CausalConv1dFn        14.56%      76.813us        80.02%     422.300us     140.767us       0.000us         0.00%      14.623us       4.874us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.04%      26.621us        60.06%     316.927us     105.642us      10.943us       100.00%      14.623us       4.874us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.943us       100.00%      10.943us       3.648us             3  
-                                Activity Buffer Request        24.63%     129.993us        24.63%     129.993us     129.993us       3.680us        33.63%       3.680us       3.680us             1  
-                                       aten::empty_like         1.53%       8.070us         5.41%      28.560us       9.520us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.88%      20.490us         3.88%      20.490us       6.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.38%     160.313us        30.38%     160.313us      53.438us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.98%       5.160us         0.98%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.901us      1062.24%     115.901us     115.901us             1  
+                               hf_kernels_causal_conv1d        13.49%      81.452us        99.17%     598.664us     598.664us       0.000us         0.00%      14.591us      14.591us             1  
+                                         CausalConv1dFn        11.49%      69.393us        85.68%     517.212us     172.404us       0.000us         0.00%      14.591us       4.864us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.25%      25.660us        69.54%     419.779us     139.926us      10.911us       100.00%      14.591us       4.864us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.911us       100.00%      10.911us       3.637us             3  
+                                Activity Buffer Request        38.07%     229.795us        38.07%     229.795us     229.795us       3.680us        33.73%       3.680us       3.680us             1  
+                                       aten::empty_like         1.23%       7.430us         4.64%      28.040us       9.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.41%      20.610us         3.41%      20.610us       6.870us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.22%     164.324us        27.22%     164.324us      54.775us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       5.020us         0.83%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 527.723us
-Self CUDA time total: 10.943us
+Self CPU time total: 603.684us
+Self CUDA time total: 10.911us
 
 
 
@@ -4149,19 +4157,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.879us      1185.50%     130.879us     130.879us             1  
-                               hf_kernels_causal_conv1d         6.10%     112.423us        99.71%       1.839ms       1.839ms       0.000us         0.00%      14.752us      14.752us             1  
-                                         CausalConv1dFn         4.42%      81.553us        93.62%       1.726ms     575.457us       0.000us         0.00%      14.752us       4.917us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.34%      24.629us        87.45%       1.613ms     537.533us      11.040us       100.00%      14.752us       4.917us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.040us       100.00%      11.040us       3.680us             3  
-                                Activity Buffer Request        77.44%       1.428ms        77.44%       1.428ms       1.428ms       3.712us        33.62%       3.712us       3.712us             1  
-                                       aten::empty_like         0.46%       8.560us         1.75%      32.220us      10.740us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.28%      23.660us         1.28%      23.660us       7.887us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.67%     159.915us         8.67%     159.915us      53.305us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.260us         0.29%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.031us      1126.74%     124.031us     124.031us             1  
+                               hf_kernels_causal_conv1d         4.38%      80.211us        99.73%       1.825ms       1.825ms       0.000us         0.00%      14.688us      14.688us             1  
+                                         CausalConv1dFn         3.92%      71.693us        95.35%       1.744ms     581.490us       0.000us         0.00%      14.688us       4.896us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.35%      24.770us        89.82%       1.643ms     547.796us      11.008us       100.00%      14.688us       4.896us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.008us       100.00%      11.008us       3.669us             3  
+                                Activity Buffer Request        79.44%       1.453ms        79.44%       1.453ms       1.453ms       3.680us        33.43%       3.680us       3.680us             1  
+                                       aten::empty_like         0.44%       8.110us         1.61%      29.390us       9.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.16%      21.280us         1.16%      21.280us       7.093us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.03%     165.165us         9.03%     165.165us      55.055us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.921us         0.27%       4.921us       4.921us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.844ms
-Self CUDA time total: 11.040us
+Self CPU time total: 1.830ms
+Self CUDA time total: 11.008us
 
 
 
@@ -4171,19 +4179,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.988us      1097.16%     124.988us     124.988us             1  
-                               hf_kernels_causal_conv1d        14.68%      75.042us        98.95%     505.802us     505.802us       0.000us         0.00%      15.232us      15.232us             1  
-                                         CausalConv1dFn        15.20%      77.712us        84.27%     430.760us     143.587us       0.000us         0.00%      15.232us       5.077us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.71%      24.091us        63.54%     324.777us     108.259us      11.392us       100.00%      15.232us       5.077us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.392us       100.00%      11.392us       3.797us             3  
-                                Activity Buffer Request        26.66%     136.263us        26.66%     136.263us     136.263us       3.840us        33.71%       3.840us       3.840us             1  
-                                       aten::empty_like         1.46%       7.441us         5.53%      28.271us       9.424us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.08%      20.830us         4.08%      20.830us       6.943us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.17%     164.423us        32.17%     164.423us      54.808us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.05%       5.351us         1.05%       5.351us       5.351us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.078us      1080.72%     122.078us     122.078us             1  
+                               hf_kernels_causal_conv1d        13.22%      78.432us        99.12%     587.944us     587.944us       0.000us         0.00%      15.072us      15.072us             1  
+                                         CausalConv1dFn        12.12%      71.922us        85.89%     509.512us     169.837us       0.000us         0.00%      15.072us       5.024us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.25%      25.220us        69.07%     409.719us     136.573us      11.296us       100.00%      15.072us       5.024us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.296us       100.00%      11.296us       3.765us             3  
+                                Activity Buffer Request        37.46%     222.215us        37.46%     222.215us     222.215us       3.776us        33.43%       3.776us       3.776us             1  
+                                       aten::empty_like         1.25%       7.430us         4.70%      27.871us       9.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.45%      20.441us         3.45%      20.441us       6.814us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.36%     162.284us        27.36%     162.284us      54.095us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.88%       5.240us         0.88%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 511.153us
-Self CUDA time total: 11.392us
+Self CPU time total: 593.184us
+Self CUDA time total: 11.296us
 
 
 
@@ -4193,19 +4201,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.775us       262.12%     131.775us     131.775us             1  
-                               hf_kernels_causal_conv1d         8.81%      77.263us        99.39%     871.362us     871.362us       0.000us         0.00%      83.680us      83.680us             1  
-                                         CausalConv1dFn         8.68%      76.121us        90.57%     794.099us     264.700us       0.000us         0.00%      83.680us      27.893us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.02%      26.501us        78.58%     688.947us     229.649us      50.272us       100.00%      83.680us      27.893us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.272us       100.00%      50.272us      16.757us             3  
-                                Activity Buffer Request        55.77%     488.972us        55.77%     488.972us     488.972us      33.408us        66.45%      33.408us      33.408us             1  
-                                       aten::empty_like         0.92%       8.040us         3.31%      29.031us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.39%      20.991us         2.39%      20.991us       6.997us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.79%     173.474us        19.79%     173.474us      57.825us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.61%       5.370us         0.61%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.648us       252.30%     127.648us     127.648us             1  
+                               hf_kernels_causal_conv1d         4.31%      79.103us        99.73%       1.830ms       1.830ms       0.000us         0.00%      84.257us      84.257us             1  
+                                         CausalConv1dFn         3.94%      72.391us        95.42%       1.751ms     583.740us       0.000us         0.00%      84.257us      28.086us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      26.941us        89.93%       1.650ms     550.139us      50.593us       100.00%      84.257us      28.086us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.593us       100.00%      50.593us      16.864us             3  
+                                Activity Buffer Request        79.45%       1.458ms        79.45%       1.458ms       1.458ms      33.664us        66.54%      33.664us      33.664us             1  
+                                       aten::empty_like         0.41%       7.590us         1.55%      28.411us       9.470us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.13%      20.821us         1.13%      20.821us       6.940us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.01%     165.403us         9.01%     165.403us      55.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.880us         0.27%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 876.732us
-Self CUDA time total: 50.272us
+Self CPU time total: 1.835ms
+Self CUDA time total: 50.593us
 
 
 
@@ -4215,19 +4223,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.295us       247.23%     127.295us     127.295us             1  
-                               hf_kernels_causal_conv1d        15.09%      77.332us        99.04%     507.562us     507.562us       0.000us         0.00%      86.016us      86.016us             1  
-                                         CausalConv1dFn        14.68%      75.241us        83.95%     430.230us     143.410us       0.000us         0.00%      86.016us      28.672us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.05%      25.861us        63.40%     324.927us     108.309us      51.488us       100.00%      86.016us      28.672us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.488us       100.00%      51.488us      17.163us             3  
-                                Activity Buffer Request        25.26%     129.463us        25.26%     129.463us     129.463us      34.528us        67.06%      34.528us      34.528us             1  
-                                       aten::empty_like         1.67%       8.561us         5.87%      30.062us      10.021us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.20%      21.501us         4.20%      21.501us       7.167us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.09%     169.603us        33.09%     169.603us      56.534us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.96%       4.929us         0.96%       4.929us       4.929us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.347us       241.52%     124.347us     124.347us             1  
+                               hf_kernels_causal_conv1d        13.88%      78.022us        99.09%     557.033us     557.033us       0.000us         0.00%      85.980us      85.980us             1  
+                                         CausalConv1dFn        12.54%      70.483us        85.21%     479.011us     159.670us       0.000us         0.00%      85.980us      28.660us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.34%      24.401us        67.64%     380.208us     126.736us      51.486us       100.00%      85.980us      28.660us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.486us       100.00%      51.486us      17.162us             3  
+                                Activity Buffer Request        34.82%     195.764us        34.82%     195.764us     195.764us      34.494us        67.00%      34.494us      34.494us             1  
+                                       aten::empty_like         1.33%       7.470us         5.04%      28.320us       9.440us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.71%      20.850us         3.71%      20.850us       6.950us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        28.47%     160.043us        28.47%     160.043us      53.348us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.91%       5.110us         0.91%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 512.491us
-Self CUDA time total: 51.488us
+Self CPU time total: 562.143us
+Self CUDA time total: 51.486us
 
 
 
@@ -4237,19 +4245,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.214us      3104.87%     121.214us     121.214us             1  
-                               hf_kernels_causal_conv1d         8.71%      75.123us        99.37%     856.672us     856.672us       0.000us         0.00%       5.184us       5.184us             1  
-                                         CausalConv1dFn         8.55%      73.741us        90.66%     781.549us     260.516us       0.000us         0.00%       5.184us       1.728us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.92%      25.150us        78.63%     677.857us     225.952us       3.904us       100.00%       5.184us       1.728us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.904us       100.00%       3.904us       1.301us             3  
-                                Activity Buffer Request        56.24%     484.832us        56.24%     484.832us     484.832us       1.280us        32.79%       1.280us       1.280us             1  
-                                       aten::empty_like         1.08%       9.311us         3.47%      29.951us       9.984us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.39%      20.640us         2.39%      20.640us       6.880us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.47%     167.875us        19.47%     167.875us      55.958us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.63%       5.440us         0.63%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.728us      3142.18%     121.728us     121.728us             1  
+                               hf_kernels_causal_conv1d         4.20%      76.603us        99.72%       1.818ms       1.818ms       0.000us         0.00%       5.123us       5.123us             1  
+                                         CausalConv1dFn         3.96%      72.231us        95.52%       1.742ms     580.506us       0.000us         0.00%       5.123us       1.708us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.49%      27.119us        89.93%       1.640ms     546.545us       3.874us       100.00%       5.123us       1.708us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.874us       100.00%       3.874us       1.291us             3  
+                                Activity Buffer Request        79.71%       1.453ms        79.71%       1.453ms       1.453ms       1.249us        32.24%       1.249us       1.249us             1  
+                                       aten::empty_like         0.42%       7.681us         1.63%      29.652us       9.884us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.21%      21.971us         1.21%      21.971us       7.324us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.74%     159.334us         8.74%     159.334us      53.111us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.020us         0.28%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 862.112us
-Self CUDA time total: 3.904us
+Self CPU time total: 1.823ms
+Self CUDA time total: 3.874us
 
 
 
@@ -4259,19 +4267,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.438us      3086.10%     121.438us     121.438us             1  
-                               hf_kernels_causal_conv1d        15.37%      74.422us        98.89%     478.921us     478.921us       0.000us         0.00%       5.183us       5.183us             1  
-                                         CausalConv1dFn        15.69%      75.972us        83.52%     404.499us     134.833us       0.000us         0.00%       5.183us       1.728us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.44%      26.330us        61.72%     298.936us      99.645us       3.935us       100.00%       5.183us       1.728us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.935us       100.00%       3.935us       1.312us             3  
-                                Activity Buffer Request        23.74%     114.963us        23.74%     114.963us     114.963us       1.248us        31.72%       1.248us       1.248us             1  
-                                       aten::empty_like         1.57%       7.609us         6.11%      29.591us       9.864us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.54%      21.982us         4.54%      21.982us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.55%     157.643us        32.55%     157.643us      52.548us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.11%       5.391us         1.11%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     112.862us      2867.43%     112.862us     112.862us             1  
+                               hf_kernels_causal_conv1d        13.92%      73.542us        98.94%     522.552us     522.552us       0.000us         0.00%       5.216us       5.216us             1  
+                                         CausalConv1dFn        13.17%      69.571us        85.02%     449.010us     149.670us       0.000us         0.00%       5.216us       1.739us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.04%      26.641us        66.59%     351.668us     117.223us       3.936us       100.00%       5.216us       1.739us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.936us       100.00%       3.936us       1.312us             3  
+                                Activity Buffer Request        31.20%     164.773us        31.20%     164.773us     164.773us       1.280us        32.52%       1.280us       1.280us             1  
+                                       aten::empty_like         1.39%       7.351us         5.26%      27.771us       9.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.87%      20.420us         3.87%      20.420us       6.807us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.34%     160.254us        30.34%     160.254us      53.418us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.06%       5.590us         1.06%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 484.312us
-Self CUDA time total: 3.935us
+Self CPU time total: 528.142us
+Self CUDA time total: 3.936us
 
 
 
@@ -4281,19 +4289,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     152.157us      3744.94%     152.157us     152.157us             1  
-                               hf_kernels_causal_conv1d        10.88%      77.931us        99.21%     710.327us     710.327us       0.000us         0.00%       5.407us       5.407us             1  
-                                         CausalConv1dFn        11.39%      81.522us        88.32%     632.396us     210.799us       0.000us         0.00%       5.407us       1.802us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.86%      27.639us        72.73%     520.742us     173.581us       4.063us       100.00%       5.407us       1.802us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
-                                Activity Buffer Request        44.05%     315.408us        44.05%     315.408us     315.408us       1.344us        33.08%       1.344us       1.344us             1  
-                                       aten::empty_like         1.15%       8.200us         4.21%      30.132us      10.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.06%      21.932us         3.06%      21.932us       7.311us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.82%     177.695us        24.82%     177.695us      59.232us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.79%       5.681us         0.79%       5.681us       5.681us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.630us      2893.73%     117.630us     117.630us             1  
+                               hf_kernels_causal_conv1d         4.22%      76.492us        99.73%       1.809ms       1.809ms       0.000us         0.00%       5.441us       5.441us             1  
+                                         CausalConv1dFn         3.89%      70.602us        95.52%       1.732ms     577.480us       0.000us         0.00%       5.441us       1.814us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.32%      23.990us        90.04%       1.633ms     544.346us       4.065us       100.00%       5.441us       1.814us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
+                                Activity Buffer Request        79.17%       1.436ms        79.17%       1.436ms       1.436ms       1.376us        33.85%       1.376us       1.376us             1  
+                                       aten::empty_like         0.43%       7.870us         1.59%      28.801us       9.600us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.15%      20.931us         1.15%      20.931us       6.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.54%     173.024us         9.54%     173.024us      57.675us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.840us         0.27%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 716.008us
-Self CUDA time total: 4.063us
+Self CPU time total: 1.814ms
+Self CUDA time total: 4.065us
 
 
 
@@ -4303,19 +4311,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.936us      2951.18%     119.936us     119.936us             1  
-                               hf_kernels_causal_conv1d        15.86%      75.552us        99.00%     471.672us     471.672us       0.000us         0.00%       5.440us       5.440us             1  
-                                         CausalConv1dFn        16.03%      76.383us        83.14%     396.120us     132.040us       0.000us         0.00%       5.440us       1.813us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.35%      25.480us        61.26%     291.866us      97.289us       4.064us       100.00%       5.440us       1.813us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        23.14%     110.243us        23.14%     110.243us     110.243us       1.376us        33.86%       1.376us       1.376us             1  
-                                       aten::empty_like         1.53%       7.269us         5.85%      27.871us       9.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.32%      20.602us         4.32%      20.602us       6.867us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.77%     156.143us        32.77%     156.143us      52.048us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.00%       4.760us         1.00%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     112.957us      2780.14%     112.957us     112.957us             1  
+                               hf_kernels_causal_conv1d        13.63%      77.442us        99.02%     562.553us     562.553us       0.000us         0.00%       5.439us       5.439us             1  
+                                         CausalConv1dFn        12.09%      68.663us        85.39%     485.111us     161.704us       0.000us         0.00%       5.439us       1.813us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.90%      27.850us        68.41%     388.648us     129.549us       4.063us       100.00%       5.439us       1.813us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
+                                Activity Buffer Request        32.06%     182.124us        32.06%     182.124us     182.124us       1.376us        33.87%       1.376us       1.376us             1  
+                                       aten::empty_like         1.28%       7.270us         4.89%      27.800us       9.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.61%      20.530us         3.61%      20.530us       6.843us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.45%     178.674us        31.45%     178.674us      59.558us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       5.590us         0.98%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 476.432us
-Self CUDA time total: 4.064us
+Self CPU time total: 568.143us
+Self CUDA time total: 4.063us
 
 
 
@@ -4325,19 +4333,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.888us      2401.78%     129.888us     129.888us             1  
-                               hf_kernels_causal_conv1d        13.50%     106.873us        99.32%     785.980us     785.980us       0.000us         0.00%       7.264us       7.264us             1  
-                                         CausalConv1dFn        10.04%      79.422us        85.81%     679.107us     226.369us       0.000us         0.00%       7.264us       2.421us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.32%      26.310us        72.10%     570.564us     190.188us       5.408us       100.00%       7.264us       2.421us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.408us       100.00%       5.408us       1.803us             3  
-                                Activity Buffer Request        48.81%     386.260us        48.81%     386.260us     386.260us       1.856us        34.32%       1.856us       1.856us             1  
-                                       aten::empty_like         1.01%       7.981us         3.68%      29.121us       9.707us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.67%      21.140us         2.67%      21.140us       7.047us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        19.96%     157.994us        19.96%     157.994us      52.665us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.68%       5.410us         0.68%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.623us      2193.88%     118.623us     118.623us             1  
+                               hf_kernels_causal_conv1d         4.12%      74.582us        99.72%       1.807ms       1.807ms       0.000us         0.00%       7.231us       7.231us             1  
+                                         CausalConv1dFn         3.94%      71.361us        95.60%       1.732ms     577.313us       0.000us         0.00%       7.231us       2.410us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.39%      25.271us        90.02%       1.631ms     543.639us       5.407us       100.00%       7.231us       2.410us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.407us       100.00%       5.407us       1.802us             3  
+                                Activity Buffer Request        79.36%       1.438ms        79.36%       1.438ms       1.438ms       1.824us        33.73%       1.824us       1.824us             1  
+                                       aten::empty_like         0.43%       7.860us         1.64%      29.661us       9.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.20%      21.801us         1.20%      21.801us       7.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.27%     167.954us         9.27%     167.954us      55.985us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.140us         0.28%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 791.390us
-Self CUDA time total: 5.408us
+Self CPU time total: 1.812ms
+Self CUDA time total: 5.407us
 
 
 
@@ -4347,19 +4355,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.463us      2151.92%     118.463us     118.463us             1  
-                               hf_kernels_causal_conv1d        19.47%      96.181us        98.96%     488.812us     488.812us       0.000us         0.00%       7.393us       7.393us             1  
-                                         CausalConv1dFn        15.19%      75.044us        79.49%     392.631us     130.877us       0.000us         0.00%       7.393us       2.464us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.31%      26.241us        58.39%     288.397us      96.132us       5.505us       100.00%       7.393us       2.464us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.505us       100.00%       5.505us       1.835us             3  
-                                Activity Buffer Request        21.50%     106.222us        21.50%     106.222us     106.222us       1.888us        34.30%       1.888us       1.888us             1  
-                                       aten::empty_like         1.50%       7.390us         5.91%      29.190us       9.730us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.41%      21.800us         4.41%      21.800us       7.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.57%     155.934us        31.57%     155.934us      51.978us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.04%       5.140us         1.04%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     112.607us      2057.50%     112.607us     112.607us             1  
+                               hf_kernels_causal_conv1d        13.70%      73.872us        99.01%     534.033us     534.033us       0.000us         0.00%       7.361us       7.361us             1  
+                                         CausalConv1dFn        13.12%      70.792us        85.31%     460.161us     153.387us       0.000us         0.00%       7.361us       2.454us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.78%      25.770us        67.08%     361.838us     120.613us       5.473us       100.00%       7.361us       2.454us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.473us       100.00%       5.473us       1.824us             3  
+                                Activity Buffer Request        31.74%     171.214us        31.74%     171.214us     171.214us       1.888us        34.50%       1.888us       1.888us             1  
+                                       aten::empty_like         1.37%       7.381us         5.10%      27.531us       9.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.74%      20.150us         3.74%      20.150us       6.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.56%     164.854us        30.56%     164.854us      54.951us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.99%       5.340us         0.99%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 493.952us
-Self CUDA time total: 5.505us
+Self CPU time total: 539.373us
+Self CUDA time total: 5.473us
 
 
 
@@ -4369,19 +4377,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.279us       741.28%     129.279us     129.279us             1  
-                               hf_kernels_causal_conv1d         5.08%      91.861us        99.73%       1.805ms       1.805ms       0.000us         0.00%      23.296us      23.296us             1  
-                                         CausalConv1dFn         4.24%      76.815us        94.65%       1.713ms     571.078us       0.000us         0.00%      23.296us       7.765us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.42%      25.791us        88.76%       1.607ms     535.516us      17.440us       100.00%      23.296us       7.765us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.440us       100.00%      17.440us       5.813us             3  
-                                Activity Buffer Request        78.65%       1.424ms        78.65%       1.424ms       1.424ms       5.856us        33.58%       5.856us       5.856us             1  
-                                       aten::empty_like         0.47%       8.500us         1.65%      29.870us       9.957us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.18%      21.370us         1.18%      21.370us       7.123us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.68%     157.163us         8.68%     157.163us      52.388us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.911us         0.27%       4.911us       4.911us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.390us       706.22%     123.390us     123.390us             1  
+                               hf_kernels_causal_conv1d         4.18%      75.923us        99.74%       1.812ms       1.812ms       0.000us         0.00%      23.328us      23.328us             1  
+                                         CausalConv1dFn         3.97%      72.132us        95.56%       1.736ms     578.683us       0.000us         0.00%      23.328us       7.776us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      24.789us        89.99%       1.635ms     544.959us      17.472us       100.00%      23.328us       7.776us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.472us       100.00%      17.472us       5.824us             3  
+                                Activity Buffer Request        79.65%       1.447ms        79.65%       1.447ms       1.447ms       5.856us        33.52%       5.856us       5.856us             1  
+                                       aten::empty_like         0.45%       8.169us         1.60%      29.040us       9.680us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.15%      20.871us         1.15%      20.871us       6.957us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.97%     163.034us         8.97%     163.034us      54.345us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.790us         0.26%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.810ms
-Self CUDA time total: 17.440us
+Self CPU time total: 1.817ms
+Self CUDA time total: 17.472us
 
 
 
@@ -4391,19 +4399,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     139.324us       772.01%     139.324us     139.324us             1  
-                               hf_kernels_causal_conv1d        18.68%      93.362us        99.02%     494.883us     494.883us       0.000us         0.00%      24.095us      24.095us             1  
-                                         CausalConv1dFn        17.38%      86.843us        80.34%     401.521us     133.840us       0.000us         0.00%      24.095us       8.032us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.36%      26.789us        57.15%     285.628us      95.209us      18.047us       100.00%      24.095us       8.032us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
-                                Activity Buffer Request        20.49%     102.403us        20.49%     102.403us     102.403us       6.048us        33.51%       6.048us       6.048us             1  
-                                       aten::empty_like         1.48%       7.399us         5.81%      29.050us       9.683us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.33%      21.651us         4.33%      21.651us       7.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.30%     156.436us        31.30%     156.436us      52.145us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.98%       4.890us         0.98%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.788us       680.84%     121.788us     121.788us             1  
+                               hf_kernels_causal_conv1d        14.15%      75.583us        99.15%     529.782us     529.782us       0.000us         0.00%      23.904us      23.904us             1  
+                                         CausalConv1dFn        14.61%      78.041us        85.01%     454.199us     151.400us       0.000us         0.00%      23.904us       7.968us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.06%      27.012us        64.90%     346.788us     115.596us      17.888us       100.00%      23.904us       7.968us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.888us       100.00%      17.888us       5.963us             3  
+                                Activity Buffer Request        29.57%     158.003us        29.57%     158.003us     158.003us       6.016us        33.63%       6.016us       6.016us             1  
+                                       aten::empty_like         1.39%       7.440us         5.50%      29.370us       9.790us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.10%      21.930us         4.10%      21.930us       7.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.28%     161.773us        30.28%     161.773us      53.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       4.521us         0.85%       4.521us       4.521us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 499.773us
-Self CUDA time total: 18.047us
+Self CPU time total: 534.303us
+Self CUDA time total: 17.888us
 
 
 
@@ -4413,19 +4421,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     135.103us       748.58%     135.103us     135.103us             1  
-                               hf_kernels_causal_conv1d         5.37%      98.434us        99.69%       1.829ms       1.829ms       0.000us         0.00%      24.097us      24.097us             1  
-                                         CausalConv1dFn         4.35%      79.821us        94.33%       1.730ms     576.697us       0.000us         0.00%      24.097us       8.032us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      24.912us        88.33%       1.620ms     540.010us      18.048us       100.00%      24.097us       8.032us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.048us       100.00%      18.048us       6.016us             3  
-                                Activity Buffer Request        77.78%       1.427ms        77.78%       1.427ms       1.427ms       6.049us        33.52%       6.049us       6.049us             1  
-                                       aten::empty_like         0.47%       8.550us         1.65%      30.240us      10.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.18%      21.690us         1.18%      21.690us       7.230us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.19%     168.514us         9.19%     168.514us      56.171us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.620us         0.31%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.087us       696.82%     125.087us     125.087us             1  
+                               hf_kernels_causal_conv1d         4.34%      78.522us        99.74%       1.806ms       1.806ms       0.000us         0.00%      23.998us      23.998us             1  
+                                         CausalConv1dFn         4.03%      72.933us        95.40%       1.728ms     575.883us       0.000us         0.00%      23.998us       7.999us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.38%      25.019us        89.74%       1.625ms     541.689us      17.951us       100.00%      23.998us       7.999us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.951us       100.00%      17.951us       5.984us             3  
+                                Activity Buffer Request        79.32%       1.436ms        79.32%       1.436ms       1.436ms       6.047us        33.69%       6.047us       6.047us             1  
+                                       aten::empty_like         0.46%       8.289us         1.64%      29.650us       9.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      21.361us         1.18%      21.361us       7.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.04%     163.685us         9.04%     163.685us      54.562us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.781us         0.26%       4.781us       4.781us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.834ms
-Self CUDA time total: 18.048us
+Self CPU time total: 1.811ms
+Self CUDA time total: 17.951us
 
 
 
@@ -4435,19 +4443,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.684us       694.54%     130.684us     130.684us             1  
-                               hf_kernels_causal_conv1d        18.98%      97.223us        99.02%     507.183us     507.183us       0.000us         0.00%      25.120us      25.120us             1  
-                                         CausalConv1dFn        14.58%      74.692us        80.04%     409.960us     136.653us       0.000us         0.00%      25.120us       8.373us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         6.51%      33.321us        59.71%     305.838us     101.946us      18.816us       100.00%      25.120us       8.373us             3  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.816us       100.00%      18.816us       6.272us             3  
-                                Activity Buffer Request        22.33%     114.353us        22.33%     114.353us     114.353us       6.304us        33.50%       6.304us       6.304us             1  
-                                       aten::empty_like         1.71%       8.769us         5.75%      29.430us       9.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.03%      20.661us         4.03%      20.661us       6.887us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.88%     158.164us        30.88%     158.164us      52.721us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.98%       5.010us         0.98%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.887us       630.82%     117.887us     117.887us             1  
+                               hf_kernels_causal_conv1d        11.57%      72.803us        99.15%     623.975us     623.975us       0.000us         0.00%      24.960us      24.960us             1  
+                                         CausalConv1dFn        11.13%      70.072us        87.58%     551.172us     183.724us       0.000us         0.00%      24.960us       8.320us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.22%      26.540us        71.97%     452.920us     150.973us      18.688us       100.00%      24.960us       8.320us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.688us       100.00%      18.688us       6.229us             3  
+                                Activity Buffer Request        41.60%     261.806us        41.60%     261.806us     261.806us       6.272us        33.56%       6.272us       6.272us             1  
+                                       aten::empty_like         1.19%       7.500us         4.48%      28.180us       9.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.29%      20.680us         3.29%      20.680us       6.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.15%     164.574us        26.15%     164.574us      54.858us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       5.340us         0.85%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 512.193us
-Self CUDA time total: 18.816us
+Self CPU time total: 629.315us
+Self CUDA time total: 18.688us
 
 
 
@@ -4457,19 +4465,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         6.14%     112.394us        99.70%       1.825ms       1.825ms       0.000us         0.00%     162.754us     162.754us             1  
-                                         CausalConv1dFn         4.41%      80.651us        93.56%       1.713ms     570.927us       0.000us         0.00%     162.754us      54.251us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.37%      25.010us        87.54%       1.603ms     534.193us      97.985us       100.00%     162.754us      54.251us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.737us       147.71%     144.737us     144.737us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.985us       100.00%      97.985us      32.662us             3  
-                                Activity Buffer Request        77.36%       1.416ms        77.36%       1.416ms       1.416ms      64.769us        66.10%      64.769us      64.769us             1  
-                                       aten::empty_like         0.49%       8.901us         1.61%      29.551us       9.850us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.13%      20.650us         1.13%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.82%     161.445us         8.82%     161.445us      53.815us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       5.480us         0.30%       5.480us       5.480us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        11.55%      73.362us        99.20%     630.015us     630.015us       0.000us         0.00%     162.555us     162.555us             1  
+                                         CausalConv1dFn        11.07%      70.302us        87.65%     556.653us     185.551us       0.000us         0.00%     162.555us      54.185us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.16%      26.411us        72.21%     458.550us     152.850us      97.949us       100.00%     162.555us      54.185us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.645us       130.32%     127.645us     127.645us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.949us       100.00%      97.949us      32.650us             3  
+                                Activity Buffer Request        41.87%     265.926us        41.87%     265.926us     265.926us      64.606us        65.96%      64.606us      64.606us             1  
+                                       aten::empty_like         1.16%       7.350us         4.38%      27.801us       9.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.22%      20.451us         3.22%      20.451us       6.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.17%     166.213us        26.17%     166.213us      55.404us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.80%       5.050us         0.80%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.831ms
-Self CUDA time total: 97.985us
+Self CPU time total: 635.065us
+Self CUDA time total: 97.949us
 
 
 
@@ -4479,19 +4487,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        19.17%      96.654us        98.90%     498.573us     498.573us       0.000us         0.00%     163.900us     163.900us             1  
-                                         CausalConv1dFn        15.33%      77.291us        79.73%     401.919us     133.973us       0.000us         0.00%     163.900us      54.633us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.17%      26.053us        58.73%     296.088us      98.696us      98.813us       100.00%     163.900us      54.633us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.981us       135.59%     133.981us     133.981us             1  
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.813us       100.00%      98.813us      32.938us             3  
-                                Activity Buffer Request        22.39%     112.882us        22.39%     112.882us     112.882us      65.087us        65.87%      65.087us      65.087us             1  
-                                       aten::empty_like         1.55%       7.820us         5.66%      28.540us       9.513us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.11%      20.720us         4.11%      20.720us       6.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.17%     157.153us        31.17%     157.153us      52.384us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.10%       5.550us         1.10%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        11.83%      75.513us        99.19%     633.215us     633.215us       0.000us         0.00%     164.638us     164.638us             1  
+                                         CausalConv1dFn        11.21%      71.532us        87.37%     557.702us     185.901us       0.000us         0.00%     164.638us      54.879us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.76%      23.990us        71.75%     458.009us     152.670us      99.103us       100.00%     164.638us      54.879us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     132.254us       133.45%     132.254us     132.254us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      99.103us       100.00%      99.103us      33.034us             3  
+                                Activity Buffer Request        40.13%     256.155us        40.13%     256.155us     256.155us      65.535us        66.13%      65.535us      65.535us             1  
+                                       aten::empty_like         1.16%       7.400us         4.41%      28.161us       9.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.25%      20.761us         3.25%      20.761us       6.920us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.86%     177.864us        27.86%     177.864us      59.288us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.81%       5.140us         0.81%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 504.123us
-Self CUDA time total: 98.813us
+Self CPU time total: 638.355us
+Self CUDA time total: 99.103us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4501,12 +4509,12 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.06  True
-hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.04  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.06  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2     0.05  True
@@ -4517,18 +4525,19 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.06  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 
▶ UV Install Logs
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:01, 3.95it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 6.21it/s]
+Fetching 11 files: 55%|█████▍ | 6/11 [00:00<00:00, 22.15it/s] +Fetching 11 files: 82%|████████▏ | 9/11 [00:01<00:00, 4.06it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 5.94it/s]

Artifacts:

causal_conv1d.jsonl diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html index 2dd29f110a68d2d6a2cb36ff92b20f1c54eab64b..483f11ed01491ae48dbf23f627f3689336169003 100644 --- a/causal_conv1d/impls/torch_causal_conv1d.html +++ b/causal_conv1d/impls/torch_causal_conv1d.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.24s +Cell: nv | 0.21s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.24s
-
Wed Oct 29 14:27:09 2025       
+
Wed Oct 29 15:50:16 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.24s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0            109W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   29C    P0             78W /  350W |       0MiB /  46068MiB |     18%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3918,9 +3926,9 @@ Cell: nv | 0.24s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 7.23s
+Cell: benchmark | 3.67s
  | 
 
 Raw
@@ -3982,29 +3990,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     465.824us      2410.10%     465.824us     465.824us             1  
-                                            torch_eager        10.38%     221.098us        99.69%       2.123ms       2.123ms       0.000us         0.00%      21.632us      21.632us             1  
-                                               aten::to         0.54%      11.460us        78.80%       1.678ms     279.633us       0.000us         0.00%      14.304us       2.384us             6  
-                                         aten::_to_copy         2.14%      45.672us        78.26%       1.666ms     277.723us       0.000us         0.00%      14.304us       2.384us             6  
-                                            aten::copy_         2.97%      63.201us        73.51%       1.565ms     260.883us      12.000us        62.09%      14.304us       2.384us             6  
-                                           aten::conv1d         0.45%       9.560us         8.33%     177.314us      59.105us       0.000us         0.00%       7.328us       2.443us             3  
-                                      aten::convolution         0.76%      16.270us         7.88%     167.754us      55.918us       0.000us         0.00%       7.328us       2.443us             3  
-                                     aten::_convolution         1.63%      34.781us         7.11%     151.484us      50.495us       0.000us         0.00%       7.328us       2.443us             3  
-                                aten::_conv_depthwise2d         2.18%      46.460us         4.51%      96.001us      32.000us       7.328us        37.91%       7.328us       2.443us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.91%       7.328us       2.443us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.45%       6.272us       2.091us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.64%       5.728us       1.909us             3  
-                                Activity Buffer Request        67.39%       1.435ms        67.39%       1.435ms       1.435ms       2.304us        11.92%       2.304us       2.304us             1  
-                                    aten::empty_strided         2.60%      55.371us         2.60%      55.371us       9.228us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.37%      93.031us         4.37%      93.031us      10.337us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.44%      30.589us         1.81%      38.620us       4.291us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.63%      13.371us         0.63%      13.371us       0.891us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      11.811us         0.55%      11.811us       3.937us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.56%      11.940us         0.56%      11.940us       3.980us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.37%       7.972us         0.46%       9.712us       3.237us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     430.557us      2231.44%     430.557us     430.557us             1  
+                                            torch_eager        10.26%     219.304us        99.68%       2.131ms       2.131ms       0.000us         0.00%      21.630us      21.630us             1  
+                                               aten::to         0.52%      11.162us        80.31%       1.717ms     286.188us       0.000us         0.00%      14.269us       2.378us             6  
+                                         aten::_to_copy         1.64%      35.099us        79.78%       1.706ms     284.328us       0.000us         0.00%      14.269us       2.378us             6  
+                                            aten::copy_         2.90%      61.950us        75.45%       1.613ms     268.888us      11.934us        61.85%      14.269us       2.378us             6  
+                                           aten::conv1d         0.34%       7.309us         7.15%     152.793us      50.931us       0.000us         0.00%       7.361us       2.454us             3  
+                                      aten::convolution         0.74%      15.802us         6.80%     145.484us      48.495us       0.000us         0.00%       7.361us       2.454us             3  
+                                     aten::_convolution         1.46%      31.300us         6.06%     129.682us      43.227us       0.000us         0.00%       7.361us       2.454us             3  
+                                aten::_conv_depthwise2d         1.58%      33.771us         3.83%      81.842us      27.281us       7.361us        38.15%       7.361us       2.454us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        38.15%       7.361us       2.454us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.271us        32.50%       6.271us       2.090us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.663us        29.35%       5.663us       1.888us             3  
+                                Activity Buffer Request        69.43%       1.484ms        69.43%       1.484ms       1.484ms       2.335us        12.10%       2.335us       2.335us             1  
+                                    aten::empty_strided         2.69%      57.542us         2.69%      57.542us       9.590us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.29%      91.753us         4.29%      91.753us      10.195us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.26%      26.879us         1.60%      34.300us       3.811us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.54%      11.581us         0.54%      11.581us       0.772us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      11.370us         0.53%      11.370us       3.790us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.55%      11.861us         0.55%      11.861us       3.954us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.790us         0.37%       8.000us       2.667us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.129ms
-Self CUDA time total: 19.328us
+Self CPU time total: 2.138ms
+Self CUDA time total: 19.295us
 
 
 
@@ -4014,29 +4022,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.863us      1691.38%     332.863us     332.863us             1  
-                                            torch_eager         6.60%     126.115us        99.71%       1.906ms       1.906ms       0.000us         0.00%      21.792us      21.792us             1  
-                                               aten::to         0.31%       5.930us        85.54%       1.635ms     272.467us       0.000us         0.00%      13.760us       2.293us             6  
-                                         aten::_to_copy         1.30%      24.791us        85.23%       1.629ms     271.478us       0.000us         0.00%      13.760us       2.293us             6  
-                                            aten::copy_         2.71%      51.809us        82.30%       1.573ms     262.158us      11.648us        59.19%      13.760us       2.293us             6  
-                                           aten::conv1d         0.31%       5.929us         6.17%     117.852us      39.284us       0.000us         0.00%       8.032us       2.677us             3  
-                                      aten::convolution         0.53%      10.111us         5.86%     111.923us      37.308us       0.000us         0.00%       8.032us       2.677us             3  
-                                     aten::_convolution         1.20%      22.951us         5.33%     101.812us      33.937us       0.000us         0.00%       8.032us       2.677us             3  
-                                aten::_conv_depthwise2d         1.20%      22.860us         3.35%      64.021us      21.340us       8.032us        40.81%       8.032us       2.677us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.032us        40.81%       8.032us       2.677us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        30.89%       6.080us       2.027us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.29%       5.568us       1.856us             3  
-                                Activity Buffer Request        77.00%       1.472ms        77.00%       1.472ms       1.472ms       2.112us        10.73%       2.112us       2.112us             1  
-                                    aten::empty_strided         1.63%      31.132us         1.63%      31.132us       5.189us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.70%      70.762us         3.70%      70.762us       7.862us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      16.659us         1.16%      22.190us       2.466us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       8.781us         0.46%       8.781us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      10.521us         0.55%      10.521us       3.507us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.49%       9.390us         0.49%       9.390us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.540us         0.35%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.581us      1716.35%     335.581us     335.581us             1  
+                                            torch_eager         7.79%     148.313us        99.72%       1.898ms       1.898ms       0.000us         0.00%      21.664us      21.664us             1  
+                                               aten::to         0.37%       7.132us        84.25%       1.604ms     267.308us       0.000us         0.00%      13.760us       2.293us             6  
+                                         aten::_to_copy         1.28%      24.280us        83.88%       1.597ms     266.119us       0.000us         0.00%      13.760us       2.293us             6  
+                                            aten::copy_         2.64%      50.321us        81.08%       1.543ms     257.239us      11.648us        59.57%      13.760us       2.293us             6  
+                                           aten::conv1d         0.32%       6.130us         6.26%     119.243us      39.748us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         0.58%      11.131us         5.94%     113.113us      37.704us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         1.18%      22.459us         5.36%     101.982us      33.994us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         1.08%      20.592us         3.32%      63.132us      21.044us       7.904us        40.43%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        40.43%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        31.10%       6.080us       2.027us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.48%       5.568us       1.856us             3  
+                                Activity Buffer Request        75.89%       1.445ms        75.89%       1.445ms       1.445ms       2.112us        10.80%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.52%      29.001us         1.52%      29.001us       4.834us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.69%      70.220us         3.69%      70.220us       7.802us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      17.542us         1.22%      23.251us       2.583us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%       9.139us         0.48%       9.139us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.58%      11.060us         0.58%      11.060us       3.687us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%       9.700us         0.51%       9.700us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.860us         0.37%       7.020us       2.340us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.911ms
-Self CUDA time total: 19.680us
+Self CPU time total: 1.904ms
+Self CUDA time total: 19.552us
 
 
 
@@ -4046,29 +4054,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     355.199us      1913.89%     355.199us     355.199us             1  
-                                            torch_eager         6.67%     125.171us        99.71%       1.872ms       1.872ms       0.000us         0.00%      20.511us      20.511us             1  
-                                               aten::to         0.32%       6.091us        84.23%       1.581ms     263.570us       0.000us         0.00%      13.600us       2.267us             6  
-                                         aten::_to_copy         1.32%      24.859us        83.90%       1.575ms     262.555us       0.000us         0.00%      13.600us       2.267us             6  
-                                            aten::copy_         2.70%      50.760us        80.88%       1.518ms     253.083us      11.648us        62.76%      13.600us       2.267us             6  
-                                           aten::conv1d         0.30%       5.670us         7.37%     138.423us      46.141us       0.000us         0.00%       6.911us       2.304us             3  
-                                      aten::convolution         0.52%       9.720us         7.07%     132.753us      44.251us       0.000us         0.00%       6.911us       2.304us             3  
-                                     aten::_convolution         1.24%      23.210us         6.55%     123.033us      41.011us       0.000us         0.00%       6.911us       2.304us             3  
-                                aten::_conv_depthwise2d         1.26%      23.712us         4.48%      84.033us      28.011us       6.911us        37.24%       6.911us       2.304us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.911us        37.24%       6.911us       2.304us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        32.24%       5.984us       1.995us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        30.52%       5.664us       1.888us             3  
-                                Activity Buffer Request        75.59%       1.419ms        75.59%       1.419ms       1.419ms       1.952us        10.52%       1.952us       1.952us             1  
-                                    aten::empty_strided         1.70%      31.973us         1.70%      31.973us       5.329us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.83%      72.002us         3.83%      72.002us       8.000us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      16.661us         1.15%      21.682us       2.409us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.48%       8.941us         0.48%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.49%      28.041us         1.49%      28.041us       9.347us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       8.840us         0.47%       8.840us       2.947us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       5.960us         0.40%       7.470us       2.490us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     343.036us      1844.97%     343.036us     343.036us             1  
+                                            torch_eager         7.68%     146.161us        99.72%       1.897ms       1.897ms       0.000us         0.00%      20.481us      20.481us             1  
+                                               aten::to         0.37%       6.953us        83.90%       1.596ms     266.066us       0.000us         0.00%      13.536us       2.256us             6  
+                                         aten::_to_copy         1.25%      23.842us        83.53%       1.589ms     264.907us       0.000us         0.00%      13.536us       2.256us             6  
+                                            aten::copy_         2.67%      50.789us        80.65%       1.535ms     255.762us      11.648us        62.65%      13.536us       2.256us             6  
+                                           aten::conv1d         0.33%       6.290us         6.66%     126.782us      42.261us       0.000us         0.00%       6.945us       2.315us             3  
+                                      aten::convolution         0.51%       9.650us         6.33%     120.492us      40.164us       0.000us         0.00%       6.945us       2.315us             3  
+                                     aten::_convolution         1.22%      23.120us         5.83%     110.842us      36.947us       0.000us         0.00%       6.945us       2.315us             3  
+                                aten::_conv_depthwise2d         1.44%      27.490us         3.78%      71.832us      23.944us       6.945us        37.35%       6.945us       2.315us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.945us        37.35%       6.945us       2.315us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us        32.01%       5.952us       1.984us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.64%       5.696us       1.899us             3  
+                                Activity Buffer Request        75.34%       1.433ms        75.34%       1.433ms       1.433ms       1.888us        10.15%       1.888us       1.888us             1  
+                                    aten::empty_strided         1.63%      31.030us         1.63%      31.030us       5.172us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.99%      75.911us         3.99%      75.911us       8.435us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      17.191us         1.18%      22.431us       2.492us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       8.831us         0.46%       8.831us       0.589us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      10.511us         0.55%      10.511us       3.504us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.261us         0.43%       8.261us       2.754us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.930us         0.38%       7.310us       2.437us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.878ms
-Self CUDA time total: 18.559us
+Self CPU time total: 1.903ms
+Self CUDA time total: 18.593us
 
 
 
@@ -4078,29 +4086,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.841us      1714.87%     335.841us     335.841us             1  
-                                            torch_eager         6.09%     125.084us        99.75%       2.047ms       2.047ms       0.000us         0.00%      21.728us      21.728us             1  
-                                               aten::to         0.29%       6.012us        86.59%       1.777ms     296.210us       0.000us         0.00%      14.049us       2.341us             6  
-                                         aten::_to_copy         1.18%      24.318us        86.30%       1.771ms     295.209us       0.000us         0.00%      14.049us       2.341us             6  
-                                            aten::copy_         2.44%      50.170us        83.64%       1.717ms     286.105us      11.905us        60.79%      14.049us       2.341us             6  
-                                           aten::conv1d         0.29%       5.981us         5.73%     117.633us      39.211us       0.000us         0.00%       7.679us       2.560us             3  
-                                      aten::convolution         0.48%       9.909us         5.44%     111.652us      37.217us       0.000us         0.00%       7.679us       2.560us             3  
-                                     aten::_convolution         1.11%      22.712us         4.96%     101.743us      33.914us       0.000us         0.00%       7.679us       2.560us             3  
-                                aten::_conv_depthwise2d         1.08%      22.231us         3.11%      63.781us      21.260us       7.679us        39.21%       7.679us       2.560us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us        39.21%       7.679us       2.560us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.54%       6.176us       2.059us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.729us        29.25%       5.729us       1.910us             3  
-                                Activity Buffer Request        70.17%       1.440ms        70.17%       1.440ms       1.440ms       2.144us        10.95%       2.144us       2.144us             1  
-                                    aten::empty_strided         1.48%      30.301us         1.48%      30.301us       5.050us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.02%     246.676us        12.02%     246.676us      27.408us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.85%      17.450us         1.12%      22.930us       2.548us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.940us         0.44%       8.940us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.630us         0.47%       9.630us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.56%      11.490us         0.56%      11.490us       3.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.710us         0.34%       6.930us       2.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     329.535us      1679.93%     329.535us     329.535us             1  
+                                            torch_eager         6.84%     137.444us        99.71%       2.005ms       2.005ms       0.000us         0.00%      21.760us      21.760us             1  
+                                               aten::to         0.33%       6.580us        85.73%       1.724ms     287.253us       0.000us         0.00%      14.048us       2.341us             6  
+                                         aten::_to_copy         1.15%      23.069us        85.41%       1.717ms     286.156us       0.000us         0.00%      14.048us       2.341us             6  
+                                            aten::copy_         2.51%      50.362us        82.79%       1.664ms     277.390us      11.904us        60.69%      14.048us       2.341us             6  
+                                           aten::conv1d         0.28%       5.589us         5.76%     115.862us      38.621us       0.000us         0.00%       7.712us       2.571us             3  
+                                      aten::convolution         0.52%      10.381us         5.49%     110.273us      36.758us       0.000us         0.00%       7.712us       2.571us             3  
+                                     aten::_convolution         1.13%      22.651us         4.97%      99.892us      33.297us       0.000us         0.00%       7.712us       2.571us             3  
+                                aten::_conv_depthwise2d         1.01%      20.392us         3.08%      61.981us      20.660us       7.712us        39.31%       7.712us       2.571us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        39.31%       7.712us       2.571us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.48%       6.176us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.20%       5.728us       1.909us             3  
+                                Activity Buffer Request        70.54%       1.418ms        70.54%       1.418ms       1.418ms       2.144us        10.93%       2.144us       2.144us             1  
+                                    aten::empty_strided         1.47%      29.531us         1.47%      29.531us       4.922us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.85%     218.024us        10.85%     218.024us      24.225us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      17.580us         1.13%      22.650us       2.517us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.42%       8.370us         0.42%       8.370us       0.558us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.699us         0.48%       9.699us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.49%       9.760us         0.49%       9.760us       3.253us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.670us         0.34%       6.790us       2.263us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.053ms
-Self CUDA time total: 19.584us
+Self CPU time total: 2.010ms
+Self CUDA time total: 19.616us
 
 
 
@@ -4110,29 +4118,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     329.565us      1339.31%     329.565us     329.565us             1  
-                                            torch_eager         6.13%     122.184us        99.75%       1.990ms       1.990ms       0.000us         0.00%      26.911us      26.911us             1  
-                                               aten::to         0.30%       5.979us        86.40%       1.724ms     287.259us       0.000us         0.00%      15.359us       2.560us             6  
-                                         aten::_to_copy         1.37%      27.300us        86.10%       1.718ms     286.262us       0.000us         0.00%      15.359us       2.560us             6  
-                                            aten::copy_         2.45%      48.801us        83.22%       1.660ms     276.655us      13.055us        53.05%      15.359us       2.560us             6  
-                                           aten::conv1d         0.29%       5.841us         5.86%     116.932us      38.977us       0.000us         0.00%      11.552us       3.851us             3  
-                                      aten::convolution         0.50%       9.929us         5.57%     111.091us      37.030us       0.000us         0.00%      11.552us       3.851us             3  
-                                     aten::_convolution         1.16%      23.192us         5.07%     101.162us      33.721us       0.000us         0.00%      11.552us       3.851us             3  
-                                aten::_conv_depthwise2d         1.12%      22.341us         3.11%      62.030us      20.677us      11.552us        46.95%      11.552us       3.851us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        46.95%      11.552us       3.851us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.688us        27.18%       6.688us       2.229us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.367us        25.87%       6.367us       2.122us             3  
-                                Activity Buffer Request        71.71%       1.430ms        71.71%       1.430ms       1.430ms       2.304us         9.36%       2.304us       2.304us             1  
-                                    aten::empty_strided         1.52%      30.342us         1.52%      30.342us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.06%     200.744us        10.06%     200.744us      22.305us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.86%      17.251us         1.14%      22.681us       2.520us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.051us         0.45%       9.051us       0.603us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.579us         0.48%       9.579us       3.193us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%      10.050us         0.50%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.019us         0.36%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     344.351us      1401.22%     344.351us     344.351us             1  
+                                            torch_eager         7.42%     151.244us        99.76%       2.034ms       2.034ms       0.000us         0.00%      26.847us      26.847us             1  
+                                               aten::to         0.33%       6.730us        85.23%       1.738ms     289.583us       0.000us         0.00%      15.264us       2.544us             6  
+                                         aten::_to_copy         1.15%      23.491us        84.90%       1.731ms     288.462us       0.000us         0.00%      15.264us       2.544us             6  
+                                            aten::copy_         2.84%      57.871us        82.24%       1.677ms     279.428us      12.992us        52.87%      15.264us       2.544us             6  
+                                           aten::conv1d         0.31%       6.410us         5.76%     117.443us      39.148us       0.000us         0.00%      11.583us       3.861us             3  
+                                      aten::convolution         0.49%      10.031us         5.45%     111.033us      37.011us       0.000us         0.00%      11.583us       3.861us             3  
+                                     aten::_convolution         1.08%      22.081us         4.95%     101.002us      33.667us       0.000us         0.00%      11.583us       3.861us             3  
+                                aten::_conv_depthwise2d         1.04%      21.239us         3.10%      63.201us      21.067us      11.583us        47.13%      11.583us       3.861us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.583us        47.13%      11.583us       3.861us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us        27.08%       6.656us       2.219us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        25.78%       6.336us       2.112us             3  
+                                Activity Buffer Request        70.08%       1.429ms        70.08%       1.429ms       1.429ms       2.272us         9.25%       2.272us       2.272us             1  
+                                    aten::empty_strided         1.51%      30.710us         1.51%      30.710us       5.118us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.42%     212.467us        10.42%     212.467us      23.607us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.130us         1.15%      23.350us       2.594us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.790us         0.43%       8.790us       0.586us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.800us         0.48%       9.800us       3.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.640us         0.47%       9.640us       3.213us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.760us         0.35%       7.050us       2.350us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.995ms
-Self CUDA time total: 24.607us
+Self CPU time total: 2.039ms
+Self CUDA time total: 24.575us
 
 
 
@@ -4142,29 +4150,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.812us      1379.20%     358.812us     358.812us             1  
-                                            torch_eager         6.94%     139.423us        99.75%       2.005ms       2.005ms       0.000us         0.00%      28.256us      28.256us             1  
-                                               aten::to         0.33%       6.550us        85.45%       1.717ms     286.205us       0.000us         0.00%      15.199us       2.533us             6  
-                                         aten::_to_copy         1.20%      24.182us        85.13%       1.711ms     285.114us       0.000us         0.00%      15.199us       2.533us             6  
-                                            aten::copy_         2.59%      52.130us        82.30%       1.654ms     275.648us      12.959us        49.81%      15.199us       2.533us             6  
-                                           aten::conv1d         0.30%       6.120us         5.97%     119.993us      39.998us       0.000us         0.00%      13.057us       4.352us             3  
-                                      aten::convolution         0.48%       9.660us         5.67%     113.873us      37.958us       0.000us         0.00%      13.057us       4.352us             3  
-                                     aten::_convolution         1.13%      22.802us         5.19%     104.213us      34.738us       0.000us         0.00%      13.057us       4.352us             3  
-                                aten::_conv_depthwise2d         1.09%      21.932us         3.25%      65.242us      21.747us      13.057us        50.19%      13.057us       4.352us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.057us        50.19%      13.057us       4.352us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.623us        25.46%       6.623us       2.208us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.35%       6.336us       2.112us             3  
-                                Activity Buffer Request        70.68%       1.420ms        70.68%       1.420ms       1.420ms       2.240us         8.61%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.62%      32.611us         1.62%      32.611us       5.435us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.17%     204.364us        10.17%     204.364us      22.707us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.88%      17.647us         1.15%      23.189us       2.577us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       9.382us         0.47%       9.382us       0.625us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.58%      11.651us         0.58%      11.651us       3.884us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%       8.769us         0.44%       8.769us       2.923us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       6.420us         0.39%       7.890us       2.630us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.990us      1278.37%     332.990us     332.990us             1  
+                                            torch_eager         7.09%     142.971us        99.76%       2.011ms       2.011ms       0.000us         0.00%      28.288us      28.288us             1  
+                                               aten::to         0.35%       7.062us        85.44%       1.723ms     287.120us       0.000us         0.00%      15.232us       2.539us             6  
+                                         aten::_to_copy         1.18%      23.771us        85.09%       1.716ms     285.943us       0.000us         0.00%      15.232us       2.539us             6  
+                                            aten::copy_         2.51%      50.519us        82.47%       1.663ms     277.136us      12.992us        49.88%      15.232us       2.539us             6  
+                                           aten::conv1d         0.32%       6.541us         5.84%     117.833us      39.278us       0.000us         0.00%      13.056us       4.352us             3  
+                                      aten::convolution         0.52%      10.410us         5.52%     111.292us      37.097us       0.000us         0.00%      13.056us       4.352us             3  
+                                     aten::_convolution         1.19%      24.049us         5.00%     100.882us      33.627us       0.000us         0.00%      13.056us       4.352us             3  
+                                aten::_conv_depthwise2d         1.01%      20.460us         2.98%      60.052us      20.017us      13.056us        50.12%      13.056us       4.352us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.056us        50.12%      13.056us       4.352us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us        25.43%       6.624us       2.208us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.45%       6.368us       2.123us             3  
+                                Activity Buffer Request        70.71%       1.426ms        70.71%       1.426ms       1.426ms       2.240us         8.60%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.44%      29.071us         1.44%      29.071us       4.845us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.31%     207.805us        10.31%     207.805us      23.089us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      18.081us         1.15%      23.201us       2.578us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.650us         0.43%       8.650us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.49%       9.891us         0.49%       9.891us       3.297us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       8.561us         0.42%       8.561us       2.854us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       6.350us         0.38%       7.610us       2.537us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.010ms
-Self CUDA time total: 26.016us
+Self CPU time total: 2.016ms
+Self CUDA time total: 26.048us
 
 
 
@@ -4174,29 +4182,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.896us       853.65%     328.896us     328.896us             1  
-                                            torch_eager         6.29%     121.493us        99.73%       1.928ms       1.928ms       0.000us         0.00%      41.088us      41.088us             1  
-                                           aten::conv1d         0.31%       5.961us         6.00%     115.903us      38.634us       0.000us         0.00%      22.688us       7.563us             3  
-                                      aten::convolution         0.50%       9.600us         5.69%     109.942us      36.647us       0.000us         0.00%      22.688us       7.563us             3  
-                                     aten::_convolution         1.16%      22.510us         5.19%     100.342us      33.447us       0.000us         0.00%      22.688us       7.563us             3  
-                                aten::_conv_depthwise2d         1.17%      22.551us         3.25%      62.881us      20.960us      22.688us        58.89%      22.688us       7.563us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.688us        58.89%      22.688us       7.563us             3  
-                                               aten::to         0.33%       6.421us        86.08%       1.664ms     277.308us       0.000us         0.00%      18.400us       3.067us             6  
-                                         aten::_to_copy         1.25%      24.161us        85.75%       1.657ms     276.238us       0.000us         0.00%      18.400us       3.067us             6  
-                                            aten::copy_         2.57%      49.759us        82.93%       1.603ms     267.166us      15.840us        41.11%      18.400us       3.067us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        21.93%       8.448us       2.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        19.19%       7.392us       2.464us             3  
-                                Activity Buffer Request        71.07%       1.374ms        71.07%       1.374ms       1.374ms       2.560us         6.64%       2.560us       2.560us             1  
-                                    aten::empty_strided         1.57%      30.271us         1.57%      30.271us       5.045us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.43%     201.525us        10.43%     201.525us      22.392us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.86%      16.701us         1.14%      22.001us       2.445us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.751us         0.45%       8.751us       0.583us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.290us         0.48%       9.290us       3.097us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.060us         0.47%       9.060us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.459us         0.35%       6.690us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.220us       868.07%     332.220us     332.220us             1  
+                                            torch_eager         7.10%     144.065us        99.76%       2.024ms       2.024ms       0.000us         0.00%      40.831us      40.831us             1  
+                                           aten::conv1d         0.30%       6.030us         5.72%     116.102us      38.701us       0.000us         0.00%      22.464us       7.488us             3  
+                                      aten::convolution         0.49%       9.861us         5.42%     110.072us      36.691us       0.000us         0.00%      22.464us       7.488us             3  
+                                     aten::_convolution         1.11%      22.459us         4.94%     100.211us      33.404us       0.000us         0.00%      22.464us       7.488us             3  
+                                aten::_conv_depthwise2d         1.00%      20.252us         3.07%      62.362us      20.787us      22.464us        58.70%      22.464us       7.488us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.464us        58.70%      22.464us       7.488us             3  
+                                               aten::to         0.31%       6.271us        85.57%       1.737ms     289.428us       0.000us         0.00%      18.367us       3.061us             6  
+                                         aten::_to_copy         1.14%      23.180us        85.26%       1.730ms     288.383us       0.000us         0.00%      18.367us       3.061us             6  
+                                            aten::copy_         2.42%      49.061us        82.56%       1.675ms     279.226us      15.807us        41.30%      18.367us       3.061us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        22.07%       8.448us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.23%       7.359us       2.453us             3  
+                                Activity Buffer Request        70.88%       1.438ms        70.88%       1.438ms       1.438ms       2.560us         6.69%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.57%      31.760us         1.57%      31.760us       5.293us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.30%     209.084us        10.30%     209.084us      23.232us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.889us         1.13%      22.980us       2.553us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.691us         0.43%       8.691us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.58%      11.680us         0.58%      11.680us       3.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.220us         0.45%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.850us         0.36%       7.240us       2.413us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.933ms
-Self CUDA time total: 38.528us
+Self CPU time total: 2.029ms
+Self CUDA time total: 38.271us
 
 
 
@@ -4206,29 +4214,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.458us       810.83%     334.458us     334.458us             1  
-                                            torch_eager         6.32%     125.394us        99.75%       1.978ms       1.978ms       0.000us         0.00%      43.841us      43.841us             1  
-                                           aten::conv1d         0.30%       5.899us         5.88%     116.562us      38.854us       0.000us         0.00%      25.600us       8.533us             3  
-                                      aten::convolution         0.49%       9.810us         5.58%     110.663us      36.888us       0.000us         0.00%      25.600us       8.533us             3  
-                                     aten::_convolution         1.13%      22.411us         5.09%     100.853us      33.618us       0.000us         0.00%      25.600us       8.533us             3  
-                                aten::_conv_depthwise2d         1.14%      22.520us         3.20%      63.392us      21.131us      25.600us        62.06%      25.600us       8.533us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.600us        62.06%      25.600us       8.533us             3  
-                                               aten::to         0.30%       5.959us        86.14%       1.708ms     284.675us       0.000us         0.00%      18.241us       3.040us             6  
-                                         aten::_to_copy         1.33%      26.372us        85.84%       1.702ms     283.682us       0.000us         0.00%      18.241us       3.040us             6  
-                                            aten::copy_         2.49%      49.420us        83.02%       1.646ms     274.363us      15.649us        37.94%      18.241us       3.040us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.321us        20.17%       8.321us       2.774us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        17.77%       7.328us       2.443us             3  
-                                Activity Buffer Request        71.51%       1.418ms        71.51%       1.418ms       1.418ms       2.592us         6.28%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.49%      29.540us         1.49%      29.540us       4.923us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.06%     199.427us        10.06%     199.427us      22.159us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      18.199us         1.18%      23.330us       2.592us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.651us         0.44%       8.651us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.54%      10.640us         0.54%      10.640us       3.547us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.48%       9.610us         0.48%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.590us         0.34%       6.770us       2.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     347.901us       847.38%     347.901us     347.901us             1  
+                                            torch_eager         7.21%     147.111us        99.76%       2.035ms       2.035ms       0.000us         0.00%      43.616us      43.616us             1  
+                                           aten::conv1d         0.33%       6.680us         5.94%     121.133us      40.378us       0.000us         0.00%      25.376us       8.459us             3  
+                                      aten::convolution         0.49%      10.011us         5.61%     114.453us      38.151us       0.000us         0.00%      25.376us       8.459us             3  
+                                     aten::_convolution         1.21%      24.739us         5.12%     104.442us      34.814us       0.000us         0.00%      25.376us       8.459us             3  
+                                aten::_conv_depthwise2d         1.05%      21.431us         3.09%      62.981us      20.994us      25.376us        61.81%      25.376us       8.459us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.376us        61.81%      25.376us       8.459us             3  
+                                               aten::to         0.35%       7.210us        85.28%       1.740ms     289.922us       0.000us         0.00%      18.240us       3.040us             6  
+                                         aten::_to_copy         1.21%      24.639us        84.93%       1.732ms     288.720us       0.000us         0.00%      18.240us       3.040us             6  
+                                            aten::copy_         2.56%      52.303us        82.23%       1.677ms     279.542us      15.680us        38.19%      18.240us       3.040us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.288us        20.19%       8.288us       2.763us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        18.00%       7.392us       2.464us             3  
+                                Activity Buffer Request        70.32%       1.434ms        70.32%       1.434ms       1.434ms       2.560us         6.24%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.49%      30.432us         1.49%      30.432us       5.072us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.49%     213.884us        10.49%     213.884us      23.765us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.871us         1.14%      23.242us       2.582us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.942us         0.44%       8.942us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.840us         0.48%       9.840us       3.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       8.540us         0.42%       8.540us       2.847us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       6.230us         0.37%       7.540us       2.513us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.983ms
-Self CUDA time total: 41.249us
+Self CPU time total: 2.040ms
+Self CUDA time total: 41.056us
 
 
 
@@ -4238,29 +4246,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.849us       326.92%     338.849us     338.849us             1  
-                                            torch_eager         5.95%     117.585us        99.74%       1.970ms       1.970ms       0.000us         0.00%     109.697us     109.697us             1  
-                                           aten::conv1d         0.30%       5.970us         6.05%     119.502us      39.834us       0.000us         0.00%      71.232us      23.744us             3  
-                                      aten::convolution         0.49%       9.700us         5.75%     113.532us      37.844us       0.000us         0.00%      71.232us      23.744us             3  
-                                     aten::_convolution         1.15%      22.781us         5.26%     103.832us      34.611us       0.000us         0.00%      71.232us      23.744us             3  
-                                aten::_conv_depthwise2d         1.18%      23.259us         3.31%      65.420us      21.807us      71.232us        68.72%      71.232us      23.744us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      71.232us        68.72%      71.232us      23.744us             3  
-                                               aten::to         0.31%       6.199us        86.38%       1.706ms     284.313us       0.000us         0.00%      38.465us       6.411us             6  
-                                         aten::_to_copy         1.31%      25.891us        86.06%       1.700ms     283.280us       0.000us         0.00%      38.465us       6.411us             6  
-                                            aten::copy_         2.57%      50.812us        83.17%       1.643ms     273.758us      32.417us        31.28%      38.465us       6.411us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.760us        17.13%      17.760us       5.920us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.657us        14.14%      14.657us       4.886us             3  
-                                Activity Buffer Request        71.61%       1.414ms        71.61%       1.414ms       1.414ms       6.048us         5.84%       6.048us       6.048us             1  
-                                    aten::empty_strided         1.58%      31.240us         1.58%      31.240us       5.207us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.13%     200.155us        10.13%     200.155us      22.239us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      17.181us         1.15%      22.621us       2.513us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.941us         0.45%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      10.050us         0.51%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.370us         0.47%       9.370us       3.123us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.551us         0.35%       6.851us       2.284us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.331us       325.39%     334.331us     334.331us             1  
+                                            torch_eager         7.01%     141.713us        99.76%       2.018ms       2.018ms       0.000us         0.00%     108.764us     108.764us             1  
+                                           aten::conv1d         0.30%       6.090us         5.77%     116.623us      38.874us       0.000us         0.00%      70.528us      23.509us             3  
+                                      aten::convolution         0.56%      11.281us         5.46%     110.533us      36.844us       0.000us         0.00%      70.528us      23.509us             3  
+                                     aten::_convolution         1.11%      22.501us         4.91%      99.252us      33.084us       0.000us         0.00%      70.528us      23.509us             3  
+                                aten::_conv_depthwise2d         1.02%      20.538us         3.03%      61.301us      20.434us      70.528us        68.64%      70.528us      23.509us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.528us        68.64%      70.528us      23.509us             3  
+                                               aten::to         0.31%       6.229us        85.56%       1.731ms     288.457us       0.000us         0.00%      38.236us       6.373us             6  
+                                         aten::_to_copy         1.17%      23.650us        85.25%       1.725ms     287.419us       0.000us         0.00%      38.236us       6.373us             6  
+                                            aten::copy_         2.48%      50.230us        82.63%       1.672ms     278.605us      32.221us        31.36%      38.236us       6.373us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.598us        17.13%      17.598us       5.866us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.623us        14.23%      14.623us       4.874us             3  
+                                Activity Buffer Request        70.91%       1.435ms        70.91%       1.435ms       1.435ms       6.015us         5.85%       6.015us       6.015us             1  
+                                    aten::empty_strided         1.45%      29.232us         1.45%      29.232us       4.872us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.36%     209.517us        10.36%     209.517us      23.280us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.770us         1.13%      22.940us       2.549us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.42%       8.560us         0.42%       8.560us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.771us         0.48%       9.771us       3.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.351us         0.41%       8.351us       2.784us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.590us         0.33%       6.770us       2.257us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.975ms
-Self CUDA time total: 103.649us
+Self CPU time total: 2.023ms
+Self CUDA time total: 102.749us
 
 
 
@@ -4270,29 +4278,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.597us       314.53%     357.597us     357.597us             1  
-                                            torch_eager         6.01%     120.196us        99.73%       1.995ms       1.995ms       0.000us         0.00%     119.645us     119.645us             1  
-                                           aten::conv1d         0.28%       5.578us         6.85%     137.112us      45.704us       0.000us         0.00%      81.344us      27.115us             3  
-                                      aten::convolution         0.47%       9.452us         6.58%     131.534us      43.845us       0.000us         0.00%      81.344us      27.115us             3  
-                                     aten::_convolution         1.16%      23.298us         6.10%     122.082us      40.694us       0.000us         0.00%      81.344us      27.115us             3  
-                                aten::_conv_depthwise2d         1.16%      23.221us         4.15%      82.932us      27.644us      81.344us        71.55%      81.344us      27.115us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      81.344us        71.55%      81.344us      27.115us             3  
-                                               aten::to         0.33%       6.509us        85.46%       1.710ms     284.935us       0.000us         0.00%      38.301us       6.383us             6  
-                                         aten::_to_copy         1.29%      25.870us        85.14%       1.703ms     283.850us       0.000us         0.00%      38.301us       6.383us             6  
-                                            aten::copy_         2.58%      51.531us        82.27%       1.646ms     274.308us      32.350us        28.45%      38.301us       6.383us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.727us        15.59%      17.727us       5.909us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.623us        12.86%      14.623us       4.874us             3  
-                                Activity Buffer Request        70.95%       1.419ms        70.95%       1.419ms       1.419ms       5.951us         5.23%       5.951us       5.951us             1  
-                                    aten::empty_strided         1.57%      31.380us         1.57%      31.380us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.95%     199.044us         9.95%     199.044us      22.116us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      17.740us         1.16%      23.191us       2.577us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       9.433us         0.47%       9.433us       0.629us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.53%      10.531us         0.53%      10.531us       3.510us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.26%      25.130us         1.26%      25.130us       8.377us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.010us         0.38%       7.612us       2.537us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.969us       293.42%     330.969us     330.969us             1  
+                                            torch_eager        14.99%     119.634us        99.39%     793.059us     793.059us       0.000us         0.00%     118.814us     118.814us             1  
+                                           aten::conv1d         0.68%       5.459us        14.66%     116.982us      38.994us       0.000us         0.00%      80.510us      26.837us             3  
+                                      aten::convolution         1.26%      10.041us        13.98%     111.523us      37.174us       0.000us         0.00%      80.510us      26.837us             3  
+                                     aten::_convolution         2.84%      22.661us        12.72%     101.482us      33.827us       0.000us         0.00%      80.510us      26.837us             3  
+                                aten::_conv_depthwise2d         2.60%      20.719us         7.95%      63.401us      21.134us      80.510us        71.38%      80.510us      26.837us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.510us        71.38%      80.510us      26.837us             3  
+                                               aten::to         0.74%       5.920us        66.51%     530.742us      88.457us       0.000us         0.00%      38.304us       6.384us             6  
+                                         aten::_to_copy         2.94%      23.422us        65.77%     524.822us      87.470us       0.000us         0.00%      38.304us       6.384us             6  
+                                            aten::copy_         6.43%      51.340us        58.99%     470.681us      78.447us      32.288us        28.62%      38.304us       6.384us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        15.69%      17.696us       5.899us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.592us        12.94%      14.592us       4.864us             3  
+                                Activity Buffer Request        29.02%     231.576us        29.02%     231.576us     231.576us       6.016us         5.33%       6.016us       6.016us             1  
+                                    aten::empty_strided         3.85%      30.719us         3.85%      30.719us       5.120us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.56%     211.935us        26.56%     211.935us      23.548us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.12%      16.940us         2.72%      21.720us       2.413us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.02%       8.121us         1.02%       8.121us       0.541us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.582us         1.20%       9.582us       3.194us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       8.930us         1.12%       8.930us       2.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.780us         0.87%       6.970us       2.323us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.000ms
-Self CUDA time total: 113.694us
+Self CPU time total: 797.960us
+Self CUDA time total: 112.798us
 
 
 
@@ -4302,29 +4310,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         5.97%     120.782us        97.66%       1.975ms       1.975ms       0.000us         0.00%     434.301us     434.301us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     421.021us       106.85%     421.021us     421.021us             1  
-                                           aten::conv1d         0.30%       6.069us         5.79%     117.202us      39.067us       0.000us         0.00%     251.007us      83.669us             3  
-                                      aten::convolution         0.47%       9.471us         5.49%     111.133us      37.044us       0.000us         0.00%     251.007us      83.669us             3  
-                                     aten::_convolution         1.10%      22.180us         5.03%     101.662us      33.887us       0.000us         0.00%     251.007us      83.669us             3  
-                                aten::_conv_depthwise2d         1.13%      22.779us         3.17%      64.182us      21.394us     251.007us        63.71%     251.007us      83.669us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.007us        63.71%     251.007us      83.669us             3  
-                                               aten::to         0.31%       6.200us        84.52%       1.710ms     284.917us       0.000us         0.00%     183.294us      30.549us             6  
-                                         aten::_to_copy         1.19%      24.072us        84.22%       1.703ms     283.884us       0.000us         0.00%     183.294us      30.549us             6  
-                                            aten::copy_         2.45%      49.593us        81.56%       1.650ms     274.942us     143.007us        36.29%     183.294us      30.549us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.495us        26.01%     102.495us      34.165us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.512us        10.28%      40.512us      13.504us             3  
-                                Activity Buffer Request        70.36%       1.423ms        70.36%       1.423ms       1.423ms      40.287us        10.22%      40.287us      40.287us             1  
-                                    aten::empty_strided         1.46%      29.579us         1.46%      29.579us       4.930us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.86%     199.474us         9.86%     199.474us      22.164us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.84%      17.021us         1.11%      22.432us       2.492us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.090us         0.45%       9.090us       0.606us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.720us         0.48%       9.720us       3.240us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.45%       9.202us         0.45%       9.202us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.680us         0.35%       7.060us       2.353us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager        13.79%     117.896us        93.85%     802.069us     802.069us       0.000us         0.00%     432.858us     432.858us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     419.356us       106.55%     419.356us     419.356us             1  
+                                           aten::conv1d         0.66%       5.648us        13.24%     113.161us      37.720us       0.000us         0.00%     251.262us      83.754us             3  
+                                      aten::convolution         1.11%       9.481us        12.58%     107.513us      35.838us       0.000us         0.00%     251.262us      83.754us             3  
+                                     aten::_convolution         2.53%      21.627us        11.47%      98.032us      32.677us       0.000us         0.00%     251.262us      83.754us             3  
+                                aten::_conv_depthwise2d         2.35%      20.121us         7.14%      61.002us      20.334us     251.262us        63.84%     251.262us      83.754us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.262us        63.84%     251.262us      83.754us             3  
+                                               aten::to         0.66%       5.670us        63.66%     544.101us      90.683us       0.000us         0.00%     181.596us      30.266us             6  
+                                         aten::_to_copy         2.68%      22.880us        63.00%     538.431us      89.739us       0.000us         0.00%     181.596us      30.266us             6  
+                                            aten::copy_         6.04%      51.591us        56.88%     486.161us      81.027us     142.333us        36.16%     181.596us      30.266us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.271us        25.98%     102.271us      34.090us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.062us        10.18%      40.062us      13.354us             3  
+                                Activity Buffer Request        28.73%     245.556us        28.73%     245.556us     245.556us      39.263us         9.98%      39.263us      39.263us             1  
+                                    aten::empty_strided         3.44%      29.390us         3.44%      29.390us       4.898us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.77%     211.714us        24.77%     211.714us      23.524us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.99%      17.042us         2.56%      21.904us       2.434us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       8.621us         1.01%       8.621us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.11%       9.471us         1.11%       9.471us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.02%       8.710us         1.02%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       6.031us         0.84%       7.190us       2.397us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.023ms
-Self CUDA time total: 394.014us
+Self CPU time total: 854.650us
+Self CUDA time total: 393.595us
 
 
 
@@ -4334,29 +4342,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         5.89%     122.072us        95.29%       1.975ms       1.975ms       0.000us         0.00%     486.458us     486.458us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     474.010us       106.16%     474.010us     474.010us             1  
-                                           aten::conv1d         0.28%       5.830us         5.59%     115.853us      38.618us       0.000us         0.00%     299.291us      99.764us             3  
-                                      aten::convolution         0.46%       9.610us         5.31%     110.023us      36.674us       0.000us         0.00%     299.291us      99.764us             3  
-                                     aten::_convolution         1.08%      22.439us         4.85%     100.413us      33.471us       0.000us         0.00%     299.291us      99.764us             3  
-                                aten::_conv_depthwise2d         1.04%      21.490us         3.04%      62.983us      20.994us     299.291us        67.03%     299.291us      99.764us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     299.291us        67.03%     299.291us      99.764us             3  
-                                               aten::to         0.31%       6.341us        82.51%       1.710ms     284.962us       0.000us         0.00%     187.167us      31.195us             6  
-                                         aten::_to_copy         1.23%      25.592us        82.20%       1.703ms     283.906us       0.000us         0.00%     187.167us      31.195us             6  
-                                            aten::copy_         2.39%      49.481us        79.48%       1.647ms     274.512us     147.199us        32.97%     187.167us      31.195us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     106.911us        23.94%     106.911us      35.637us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.288us         9.02%      40.288us      13.429us             3  
-                                Activity Buffer Request        68.62%       1.422ms        68.62%       1.422ms       1.422ms      39.968us         8.95%      39.968us      39.968us             1  
-                                    aten::empty_strided         1.48%      30.770us         1.48%      30.770us       5.128us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.53%     197.485us         9.53%     197.485us      21.943us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.81%      16.791us         1.08%      22.301us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       9.141us         0.44%       9.141us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.701us         0.47%       9.701us       3.234us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.48%       9.941us         0.48%       9.941us       3.314us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       5.510us         0.33%       6.790us       2.263us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager        12.73%     119.312us        88.90%     833.220us     833.220us       0.000us         0.00%     487.606us     487.606us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     476.503us       106.43%     476.503us     476.503us             1  
+                                           aten::conv1d         0.59%       5.550us        12.38%     116.073us      38.691us       0.000us         0.00%     298.682us      99.561us             3  
+                                      aten::convolution         1.01%       9.430us        11.79%     110.523us      36.841us       0.000us         0.00%     298.682us      99.561us             3  
+                                     aten::_convolution         2.32%      21.781us        10.79%     101.093us      33.698us       0.000us         0.00%     298.682us      99.561us             3  
+                                aten::_conv_depthwise2d         2.19%      20.491us         6.88%      64.493us      21.498us     298.682us        66.71%     298.682us      99.561us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.682us        66.71%     298.682us      99.561us             3  
+                                               aten::to         0.60%       5.580us        60.86%     570.404us      95.067us       0.000us         0.00%     188.924us      31.487us             6  
+                                         aten::_to_copy         2.42%      22.662us        60.26%     564.824us      94.137us       0.000us         0.00%     188.924us      31.487us             6  
+                                            aten::copy_         5.33%      49.982us        54.62%     511.981us      85.330us     149.053us        33.29%     188.924us      31.487us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.926us        24.33%     108.926us      36.309us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.127us         8.96%      40.127us      13.376us             3  
+                                Activity Buffer Request        29.44%     275.977us        29.44%     275.977us     275.977us      39.871us         8.91%      39.871us      39.871us             1  
+                                    aten::empty_strided         3.22%      30.181us         3.22%      30.181us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        22.44%     210.343us        22.44%     210.343us      23.371us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.80%      16.910us         2.35%      22.009us       2.445us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.91%       8.519us         0.91%       8.519us       0.568us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%      10.891us         1.16%      10.891us       3.630us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.94%       8.790us         0.94%       8.790us       2.930us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.59%       5.500us         0.71%       6.690us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.072ms
-Self CUDA time total: 446.490us
+Self CPU time total: 937.282us
+Self CUDA time total: 447.735us
 
 
 
@@ -4366,29 +4374,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.523us      1924.96%     358.523us     358.523us             1  
-                                            torch_eager        17.94%     139.773us        99.33%     774.049us     774.049us       0.000us         0.00%      20.513us      20.513us             1  
-                                               aten::to         0.94%       7.351us        62.88%     489.983us      81.664us       0.000us         0.00%      13.376us       2.229us             6  
-                                         aten::_to_copy         3.20%      24.930us        61.93%     482.632us      80.439us       0.000us         0.00%      13.376us       2.229us             6  
-                                            aten::copy_         6.90%      53.742us        54.52%     424.881us      70.813us      11.488us        61.68%      13.376us       2.229us             6  
-                                           aten::conv1d         0.75%       5.841us        15.01%     116.973us      38.991us       0.000us         0.00%       7.137us       2.379us             3  
-                                      aten::convolution         1.33%      10.360us        14.26%     111.132us      37.044us       0.000us         0.00%       7.137us       2.379us             3  
-                                     aten::_convolution         3.01%      23.430us        12.93%     100.772us      33.591us       0.000us         0.00%       7.137us       2.379us             3  
-                                aten::_conv_depthwise2d         2.81%      21.882us         7.98%      62.192us      20.731us       7.137us        38.32%       7.137us       2.379us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.137us        38.32%       7.137us       2.379us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        31.61%       5.888us       1.963us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.600us        30.07%       5.600us       1.867us             3  
-                                Activity Buffer Request        24.98%     194.695us        24.98%     194.695us     194.695us       1.888us        10.14%       1.888us       1.888us             1  
-                                    aten::empty_strided         4.21%      32.821us         4.21%      32.821us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.28%     197.004us        25.28%     197.004us      21.889us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.16%      16.850us         2.84%      22.160us       2.462us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.13%       8.821us         1.13%       8.821us       0.588us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.22%       9.521us         1.22%       9.521us       3.174us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.31%      10.229us         1.31%      10.229us       3.410us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.74%       5.740us         0.90%       7.020us       2.340us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.326us      1733.09%     323.326us     323.326us             1  
+                                            torch_eager        14.06%     116.944us        99.42%     826.859us     826.859us       0.000us         0.00%      20.544us      20.544us             1  
+                                               aten::to         0.72%       5.971us        68.57%     570.283us      95.047us       0.000us         0.00%      13.344us       2.224us             6  
+                                         aten::_to_copy         2.68%      22.330us        67.85%     564.312us      94.052us       0.000us         0.00%      13.344us       2.224us             6  
+                                            aten::copy_         6.25%      51.969us        61.73%     513.371us      85.562us      11.456us        61.41%      13.344us       2.224us             6  
+                                           aten::conv1d         0.66%       5.530us        13.54%     112.622us      37.541us       0.000us         0.00%       7.200us       2.400us             3  
+                                      aten::convolution         1.25%      10.420us        12.88%     107.092us      35.697us       0.000us         0.00%       7.200us       2.400us             3  
+                                     aten::_convolution         2.52%      20.950us        11.62%      96.672us      32.224us       0.000us         0.00%       7.200us       2.400us             3  
+                                aten::_conv_depthwise2d         2.43%      20.241us         7.30%      60.692us      20.231us       7.200us        38.59%       7.200us       2.400us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.200us        38.59%       7.200us       2.400us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        31.56%       5.888us       1.963us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        29.85%       5.568us       1.856us             3  
+                                Activity Buffer Request        31.20%     259.516us        31.20%     259.516us     259.516us       1.888us        10.12%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.44%      28.611us         3.44%      28.611us       4.768us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.78%     222.677us        26.78%     222.677us      24.742us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      17.509us         2.72%      22.620us       2.513us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.03%       8.541us         1.03%       8.541us       0.569us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.40%      11.610us         1.40%      11.610us       3.870us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.97%       8.050us         0.97%       8.050us       2.683us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.660us         0.82%       6.830us       2.277us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 779.258us
-Self CUDA time total: 18.625us
+Self CPU time total: 831.660us
+Self CUDA time total: 18.656us
 
 
 
@@ -4398,29 +4406,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.763us      1698.07%     328.763us     328.763us             1  
-                                            torch_eager        14.65%     115.015us        99.34%     779.670us     779.670us       0.000us         0.00%      21.248us      21.248us             1  
-                                               aten::to         0.80%       6.290us        66.21%     519.631us      86.605us       0.000us         0.00%      13.406us       2.234us             6  
-                                         aten::_to_copy         3.14%      24.649us        65.41%     513.341us      85.557us       0.000us         0.00%      13.406us       2.234us             6  
-                                            aten::copy_         6.80%      53.351us        58.20%     456.761us      76.127us      11.519us        59.50%      13.406us       2.234us             6  
-                                           aten::conv1d         0.75%       5.880us        15.10%     118.484us      39.495us       0.000us         0.00%       7.842us       2.614us             3  
-                                      aten::convolution         1.21%       9.513us        14.35%     112.604us      37.535us       0.000us         0.00%       7.842us       2.614us             3  
-                                     aten::_convolution         2.83%      22.229us        13.14%     103.091us      34.364us       0.000us         0.00%       7.842us       2.614us             3  
-                                aten::_conv_depthwise2d         3.15%      24.720us         8.43%      66.141us      22.047us       7.842us        40.50%       7.842us       2.614us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.842us        40.50%       7.842us       2.614us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.887us        30.41%       5.887us       1.962us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.09%       5.632us       1.877us             3  
-                                Activity Buffer Request        29.55%     231.946us        29.55%     231.946us     231.946us       1.887us         9.75%       1.887us       1.887us             1  
-                                    aten::empty_strided         4.07%      31.931us         4.07%      31.931us       5.322us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.68%     193.684us        24.68%     193.684us      21.520us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.11%      16.541us         2.75%      21.581us       2.398us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       8.568us         1.09%       8.568us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.27%       9.951us         1.27%       9.951us       3.317us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.18%       9.250us         1.18%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       5.642us         0.89%       6.980us       2.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     351.835us      1817.33%     351.835us     351.835us             1  
+                                            torch_eager        14.08%     121.071us        99.43%     854.999us     854.999us       0.000us         0.00%      21.248us      21.248us             1  
+                                               aten::to         0.71%       6.141us        68.62%     590.084us      98.347us       0.000us         0.00%      13.312us       2.219us             6  
+                                         aten::_to_copy         2.73%      23.503us        67.91%     583.943us      97.324us       0.000us         0.00%      13.312us       2.219us             6  
+                                            aten::copy_         6.25%      53.711us        59.45%     511.250us      85.208us      11.424us        59.01%      13.312us       2.219us             6  
+                                           aten::conv1d         0.65%       5.630us        13.53%     116.322us      38.774us       0.000us         0.00%       7.936us       2.645us             3  
+                                      aten::convolution         1.12%       9.630us        12.87%     110.692us      36.897us       0.000us         0.00%       7.936us       2.645us             3  
+                                     aten::_convolution         2.70%      23.181us        11.75%     101.062us      33.687us       0.000us         0.00%       7.936us       2.645us             3  
+                                aten::_conv_depthwise2d         2.42%      20.779us         7.31%      62.821us      20.940us       7.936us        40.99%       7.936us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.99%       7.936us       2.645us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        30.25%       5.856us       1.952us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.76%       5.568us       1.856us             3  
+                                Activity Buffer Request        31.74%     272.946us        31.74%     272.946us     272.946us       1.888us         9.75%       1.888us       1.888us             1  
+                                    aten::empty_strided         5.72%      49.190us         5.72%      49.190us       8.198us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.15%     207.684us        24.15%     207.684us      23.076us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.01%      17.302us         2.65%      22.752us       2.528us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.04%       8.971us         1.04%       8.971us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%       9.970us         1.16%       9.970us       3.323us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.04%       8.981us         1.04%       8.981us       2.994us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.67%       5.729us         0.81%       6.930us       2.310us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 784.850us
-Self CUDA time total: 19.361us
+Self CPU time total: 859.928us
+Self CUDA time total: 19.360us
 
 
 
@@ -4430,29 +4438,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.454us      1698.73%     330.454us     330.454us             1  
-                                            torch_eager        14.50%     115.185us        99.38%     789.290us     789.290us       0.000us         0.00%      21.628us      21.628us             1  
-                                               aten::to         0.75%       5.979us        66.62%     529.132us      88.189us       0.000us         0.00%      14.332us       2.389us             6  
-                                         aten::_to_copy         3.11%      24.732us        65.87%     523.153us      87.192us       0.000us         0.00%      14.332us       2.389us             6  
-                                            aten::copy_         6.75%      53.590us        58.69%     466.101us      77.684us      12.157us        62.49%      14.332us       2.389us             6  
-                                           aten::conv1d         0.72%       5.740us        14.75%     117.122us      39.041us       0.000us         0.00%       7.296us       2.432us             3  
-                                      aten::convolution         1.18%       9.359us        14.02%     111.382us      37.127us       0.000us         0.00%       7.296us       2.432us             3  
-                                     aten::_convolution         2.82%      22.362us        12.85%     102.023us      34.008us       0.000us         0.00%       7.296us       2.432us             3  
-                                aten::_conv_depthwise2d         2.86%      22.741us         8.10%      64.351us      21.450us       7.296us        37.51%       7.296us       2.432us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.296us        37.51%       7.296us       2.432us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.238us        32.07%       6.238us       2.079us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.43%       5.919us       1.973us             3  
-                                Activity Buffer Request        30.19%     239.746us        30.19%     239.746us     239.746us       2.175us        11.18%       2.175us       2.175us             1  
-                                    aten::empty_strided         4.07%      32.320us         4.07%      32.320us       5.387us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.58%     195.235us        24.58%     195.235us      21.693us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.10%      16.713us         2.76%      21.891us       2.432us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.12%       8.919us         1.12%       8.919us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       5.709us         0.89%       7.030us       2.343us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.603us      1730.16%     336.603us     336.603us             1  
+                                            torch_eager         7.08%     144.052us        99.72%       2.028ms       2.028ms       0.000us         0.00%      21.631us      21.631us             1  
+                                               aten::to         0.35%       7.181us        85.48%       1.738ms     289.712us       0.000us         0.00%      14.367us       2.394us             6  
+                                         aten::_to_copy         1.19%      24.130us        85.13%       1.731ms     288.515us       0.000us         0.00%      14.367us       2.394us             6  
+                                            aten::copy_         2.47%      50.222us        82.49%       1.678ms     279.593us      12.191us        62.66%      14.367us       2.394us             6  
+                                           aten::conv1d         0.31%       6.211us         5.80%     117.993us      39.331us       0.000us         0.00%       7.264us       2.421us             3  
+                                      aten::convolution         0.49%       9.910us         5.50%     111.782us      37.261us       0.000us         0.00%       7.264us       2.421us             3  
+                                     aten::_convolution         1.11%      22.590us         5.01%     101.872us      33.957us       0.000us         0.00%       7.264us       2.421us             3  
+                                aten::_conv_depthwise2d         1.01%      20.531us         3.08%      62.662us      20.887us       7.264us        37.34%       7.264us       2.421us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.264us        37.34%       7.264us       2.421us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.24%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.42%       5.919us       1.973us             3  
+                                Activity Buffer Request        70.94%       1.443ms        70.94%       1.443ms       1.443ms       2.176us        11.18%       2.176us       2.176us             1  
+                                    aten::empty_strided         1.45%      29.401us         1.45%      29.401us       4.900us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.12%     205.814us        10.12%     205.814us      22.868us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.93%      19.000us         1.20%      24.310us       2.701us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.650us         0.43%       8.650us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%      10.541us         0.52%      10.541us       3.514us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%      10.450us         0.51%      10.450us       3.483us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.000us         0.36%       7.230us       2.410us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 794.200us
-Self CUDA time total: 19.453us
+Self CPU time total: 2.034ms
+Self CUDA time total: 19.455us
 
 
 
@@ -4462,29 +4470,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.021us      1622.51%     325.021us     325.021us             1  
-                                            torch_eager        14.95%     114.725us        99.33%     762.279us     762.279us       0.000us         0.00%      22.176us      22.176us             1  
-                                               aten::to         0.78%       5.949us        65.87%     505.530us      84.255us       0.000us         0.00%      14.272us       2.379us             6  
-                                         aten::_to_copy         3.19%      24.509us        65.10%     499.581us      83.264us       0.000us         0.00%      14.272us       2.379us             6  
-                                            aten::copy_         6.59%      50.599us        57.97%     444.890us      74.148us      12.128us        60.54%      14.272us       2.379us             6  
-                                           aten::conv1d         0.79%       6.100us        15.11%     115.973us      38.658us       0.000us         0.00%       7.904us       2.635us             3  
-                                      aten::convolution         1.34%      10.290us        14.32%     109.873us      36.624us       0.000us         0.00%       7.904us       2.635us             3  
-                                     aten::_convolution         2.97%      22.812us        12.98%      99.583us      33.194us       0.000us         0.00%       7.904us       2.635us             3  
-                                aten::_conv_depthwise2d         2.93%      22.501us         8.10%      62.182us      20.727us       7.904us        39.46%       7.904us       2.635us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        39.46%       7.904us       2.635us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        30.99%       6.208us       2.069us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.55%       5.920us       1.973us             3  
-                                Activity Buffer Request        28.71%     220.306us        28.71%     220.306us     220.306us       2.144us        10.70%       2.144us       2.144us             1  
-                                    aten::empty_strided         3.93%      30.182us         3.93%      30.182us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.32%     194.286us        25.32%     194.286us      21.587us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.11%      16.159us         2.76%      21.209us       2.357us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       8.360us         1.09%       8.360us       0.557us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.23%       9.450us         1.23%       9.450us       3.150us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.29%       9.930us         1.29%       9.930us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.71%       5.470us         0.87%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.813us      1648.79%     330.813us     330.813us             1  
+                                            torch_eager        15.21%     119.271us        99.33%     778.918us     778.918us       0.000us         0.00%      22.208us      22.208us             1  
+                                               aten::to         0.72%       5.611us        65.71%     515.312us      85.885us       0.000us         0.00%      14.305us       2.384us             6  
+                                         aten::_to_copy         2.87%      22.510us        65.00%     509.701us      84.950us       0.000us         0.00%      14.305us       2.384us             6  
+                                            aten::copy_         6.38%      50.021us        58.03%     455.090us      75.848us      12.161us        60.61%      14.305us       2.384us             6  
+                                           aten::conv1d         0.69%       5.380us        15.11%     118.473us      39.491us       0.000us         0.00%       7.903us       2.634us             3  
+                                      aten::convolution         1.31%      10.292us        14.42%     113.093us      37.698us       0.000us         0.00%       7.903us       2.634us             3  
+                                     aten::_convolution         2.98%      23.360us        13.11%     102.801us      34.267us       0.000us         0.00%       7.903us       2.634us             3  
+                                aten::_conv_depthwise2d         2.80%      21.952us         8.17%      64.041us      21.347us       7.903us        39.39%       7.903us       2.634us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us        39.39%       7.903us       2.634us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.10%       6.240us       2.080us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.921us        29.51%       5.921us       1.974us             3  
+                                Activity Buffer Request        27.43%     215.065us        27.43%     215.065us     215.065us       2.144us        10.69%       2.144us       2.144us             1  
+                                    aten::empty_strided         4.09%      32.101us         4.09%      32.101us       5.350us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        27.10%     212.544us        27.10%     212.544us      23.616us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.14%      16.752us         2.74%      21.481us       2.387us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.03%       8.081us         1.03%       8.081us       0.539us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.34%      10.539us         1.34%      10.539us       3.513us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.15%       9.010us         1.15%       9.010us       3.003us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.74%       5.829us         0.90%       7.070us       2.357us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 767.429us
-Self CUDA time total: 20.032us
+Self CPU time total: 784.179us
+Self CUDA time total: 20.064us
 
 
 
@@ -4494,29 +4502,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     356.764us       983.15%     356.764us     356.764us             1  
-                                            torch_eager        15.53%     123.844us        99.36%     792.350us     792.350us       0.000us         0.00%      38.944us      38.944us             1  
-                                           aten::conv1d         0.79%       6.320us        15.33%     122.233us      40.744us       0.000us         0.00%      20.320us       6.773us             3  
-                                      aten::convolution         1.24%       9.851us        14.54%     115.913us      38.638us       0.000us         0.00%      20.320us       6.773us             3  
-                                     aten::_convolution         2.89%      23.052us        13.30%     106.062us      35.354us       0.000us         0.00%      20.320us       6.773us             3  
-                                aten::_conv_depthwise2d         2.97%      23.692us         8.39%      66.891us      22.297us      20.320us        56.00%      20.320us       6.773us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.320us        56.00%      20.320us       6.773us             3  
-                                               aten::to         0.80%       6.349us        64.76%     516.391us      86.065us       0.000us         0.00%      18.624us       3.104us             6  
-                                         aten::_to_copy         3.21%      25.572us        63.96%     510.042us      85.007us       0.000us         0.00%      18.624us       3.104us             6  
-                                            aten::copy_         6.54%      52.120us        56.52%     450.739us      75.123us      15.968us        44.00%      18.624us       3.104us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.607us        23.72%       8.607us       2.869us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        20.28%       7.361us       2.454us             3  
-                                Activity Buffer Request        27.46%     218.966us        27.46%     218.966us     218.966us       2.656us         7.32%       2.656us       2.656us             1  
-                                    aten::empty_strided         4.23%      33.731us         4.23%      33.731us       5.622us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.38%     202.413us        25.38%     202.413us      22.490us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.20%      17.520us         2.88%      22.939us       2.549us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.21%       9.679us         1.21%       9.679us       0.645us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.40%      11.140us         1.40%      11.140us       3.713us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%       9.299us         1.17%       9.299us       3.100us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       6.010us         0.93%       7.450us       2.483us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.020us       920.74%     332.020us     332.020us             1  
+                                            torch_eager        14.90%     117.475us        99.36%     783.438us     783.438us       0.000us         0.00%      38.651us      38.651us             1  
+                                           aten::conv1d         0.68%       5.380us        14.48%     114.172us      38.057us       0.000us         0.00%      20.190us       6.730us             3  
+                                      aten::convolution         1.18%       9.340us        13.80%     108.792us      36.264us       0.000us         0.00%      20.190us       6.730us             3  
+                                     aten::_convolution         2.79%      21.980us        12.61%      99.452us      33.151us       0.000us         0.00%      20.190us       6.730us             3  
+                                aten::_conv_depthwise2d         2.62%      20.631us         7.92%      62.452us      20.817us      20.190us        55.99%      20.190us       6.730us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.190us        55.99%      20.190us       6.730us             3  
+                                               aten::to         0.79%       6.191us        66.58%     524.962us      87.494us       0.000us         0.00%      18.461us       3.077us             6  
+                                         aten::_to_copy         2.94%      23.190us        65.79%     518.771us      86.462us       0.000us         0.00%      18.461us       3.077us             6  
+                                            aten::copy_         6.46%      50.920us        58.97%     464.950us      77.492us      15.870us        44.01%      18.461us       3.077us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.479us        23.51%       8.479us       2.826us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.391us        20.50%       7.391us       2.464us             3  
+                                Activity Buffer Request        28.77%     226.875us        28.77%     226.875us     226.875us       2.591us         7.19%       2.591us       2.591us             1  
+                                    aten::empty_strided         3.88%      30.631us         3.88%      30.631us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.51%     209.015us        26.51%     209.015us      23.224us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.16%      17.000us         2.79%      22.010us       2.446us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.441us         1.07%       8.441us       0.563us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.46%      11.521us         1.46%      11.521us       3.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.07%       8.440us         1.07%       8.440us       2.813us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.73%       5.720us         0.87%       6.850us       2.283us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 797.430us
-Self CUDA time total: 36.288us
+Self CPU time total: 788.468us
+Self CUDA time total: 36.060us
 
 
 
@@ -4526,29 +4534,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.353us       866.25%     332.353us     332.353us             1  
-                                            torch_eager         6.20%     124.083us        99.73%       1.997ms       1.997ms       0.000us         0.00%      40.959us      40.959us             1  
-                                           aten::conv1d         0.30%       6.071us         5.74%     115.013us      38.338us       0.000us         0.00%      22.592us       7.531us             3  
-                                      aten::convolution         0.48%       9.660us         5.44%     108.942us      36.314us       0.000us         0.00%      22.592us       7.531us             3  
-                                     aten::_convolution         1.09%      21.840us         4.96%      99.282us      33.094us       0.000us         0.00%      22.592us       7.531us             3  
-                                aten::_conv_depthwise2d         1.15%      22.991us         3.11%      62.342us      20.781us      22.592us        58.88%      22.592us       7.531us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        58.88%      22.592us       7.531us             3  
-                                               aten::to         0.32%       6.339us        86.44%       1.731ms     288.505us       0.000us         0.00%      18.367us       3.061us             6  
-                                         aten::_to_copy         1.25%      24.980us        86.12%       1.725ms     287.449us       0.000us         0.00%      18.367us       3.061us             6  
-                                            aten::copy_         2.51%      50.252us        83.36%       1.669ms     278.222us      15.775us        41.12%      18.367us       3.061us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.94%       8.416us       2.805us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.18%       7.359us       2.453us             3  
-                                Activity Buffer Request        72.13%       1.445ms        72.13%       1.445ms       1.445ms       2.592us         6.76%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.52%      30.382us         1.52%      30.382us       5.064us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.74%     194.985us         9.74%     194.985us      21.665us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      17.330us         1.13%      22.630us       2.514us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.941us         0.45%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.610us         0.48%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       9.250us         0.46%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       5.490us         0.34%       6.780us       2.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.480us       850.14%     323.480us     323.480us             1  
+                                            torch_eager        14.55%     115.293us        99.34%     787.399us     787.399us       0.000us         0.00%      40.643us      40.643us             1  
+                                           aten::conv1d         0.68%       5.400us        14.40%     114.153us      38.051us       0.000us         0.00%      22.336us       7.445us             3  
+                                      aten::convolution         1.16%       9.210us        13.72%     108.753us      36.251us       0.000us         0.00%      22.336us       7.445us             3  
+                                     aten::_convolution         2.80%      22.227us        12.56%      99.543us      33.181us       0.000us         0.00%      22.336us       7.445us             3  
+                                aten::_conv_depthwise2d         2.52%      20.003us         7.87%      62.343us      20.781us      22.336us        58.70%      22.336us       7.445us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.336us        58.70%      22.336us       7.445us             3  
+                                               aten::to         0.94%       7.450us        67.15%     532.253us      88.709us       0.000us         0.00%      18.307us       3.051us             6  
+                                         aten::_to_copy         2.90%      22.999us        66.21%     524.803us      87.467us       0.000us         0.00%      18.307us       3.051us             6  
+                                            aten::copy_         6.35%      50.294us        59.67%     472.953us      78.825us      15.714us        41.30%      18.307us       3.051us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.321us        21.87%       8.321us       2.774us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.393us        19.43%       7.393us       2.464us             3  
+                                Activity Buffer Request        30.44%     241.286us        30.44%     241.286us     241.286us       2.593us         6.81%       2.593us       2.593us             1  
+                                    aten::empty_strided         3.64%      28.851us         3.64%      28.851us       4.808us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.80%     204.463us        25.80%     204.463us      22.718us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.08%      16.472us         2.71%      21.512us       2.390us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.500us         1.07%       8.500us       0.567us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.34%      10.600us         1.34%      10.600us       3.533us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.09%       8.650us         1.09%       8.650us       2.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.641us         0.87%       6.891us       2.297us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.003ms
-Self CUDA time total: 38.367us
+Self CPU time total: 792.609us
+Self CUDA time total: 38.050us
 
 
 
@@ -4558,29 +4566,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.952us       509.17%     328.952us     328.952us             1  
-                                            torch_eager        15.31%     114.903us        99.32%     745.599us     745.599us       0.000us         0.00%      68.701us      68.701us             1  
-                                           aten::conv1d         0.89%       6.660us        15.50%     116.373us      38.791us       0.000us         0.00%      42.238us      14.079us             3  
-                                      aten::convolution         1.33%       9.952us        14.61%     109.713us      36.571us       0.000us         0.00%      42.238us      14.079us             3  
-                                     aten::_convolution         2.95%      22.149us        13.29%      99.761us      33.254us       0.000us         0.00%      42.238us      14.079us             3  
-                                aten::_conv_depthwise2d         2.94%      22.090us         8.38%      62.891us      20.964us      42.238us        65.38%      42.238us      14.079us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      42.238us        65.38%      42.238us      14.079us             3  
-                                               aten::to         0.80%       6.039us        65.05%     488.341us      81.390us       0.000us         0.00%      26.463us       4.410us             6  
-                                         aten::_to_copy         3.23%      24.281us        64.25%     482.302us      80.384us       0.000us         0.00%      26.463us       4.410us             6  
-                                            aten::copy_         6.57%      49.302us        56.69%     425.561us      70.927us      22.367us        34.62%      26.463us       4.410us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.936us        18.48%      11.936us       3.979us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        16.15%      10.431us       3.477us             3  
-                                Activity Buffer Request        26.58%     199.565us        26.58%     199.565us     199.565us       4.096us         6.34%       4.096us       4.096us             1  
-                                    aten::empty_strided         4.32%      32.460us         4.32%      32.460us       5.410us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.45%     198.565us        26.45%     198.565us      22.063us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.13%      16.001us         2.81%      21.091us       2.343us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.16%       8.690us         1.16%       8.690us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.26%       9.490us         1.26%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.26%       9.440us         1.26%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       5.611us         0.93%       6.981us       2.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.347us       525.30%     336.347us     336.347us             1  
+                                            torch_eager        15.07%     122.512us        99.37%     807.929us     807.929us       0.000us         0.00%      68.125us      68.125us             1  
+                                           aten::conv1d         0.67%       5.471us        14.06%     114.283us      38.094us       0.000us         0.00%      41.663us      13.888us             3  
+                                      aten::convolution         1.12%       9.100us        13.38%     108.812us      36.271us       0.000us         0.00%      41.663us      13.888us             3  
+                                     aten::_convolution         2.67%      21.730us        12.26%      99.712us      33.237us       0.000us         0.00%      41.663us      13.888us             3  
+                                aten::_conv_depthwise2d         2.50%      20.351us         7.64%      62.102us      20.701us      41.663us        65.07%      41.663us      13.888us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.663us        65.07%      41.663us      13.888us             3  
+                                               aten::to         0.85%       6.891us        66.79%     543.023us      90.504us       0.000us         0.00%      26.462us       4.410us             6  
+                                         aten::_to_copy         2.85%      23.179us        65.94%     536.132us      89.355us       0.000us         0.00%      26.462us       4.410us             6  
+                                            aten::copy_         6.23%      50.622us        59.33%     482.372us      80.395us      22.367us        34.93%      26.462us       4.410us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.935us        18.64%      11.935us       3.978us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        16.29%      10.432us       3.477us             3  
+                                Activity Buffer Request        30.75%     250.036us        30.75%     250.036us     250.036us       4.095us         6.40%       4.095us       4.095us             1  
+                                    aten::empty_strided         3.76%      30.581us         3.76%      30.581us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.31%     205.766us        25.31%     205.766us      22.863us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.22%      18.070us         2.95%      23.970us       2.663us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.13%       9.220us         1.13%       9.220us       0.615us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%       9.690us         1.19%       9.690us       3.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.99%       8.009us         0.99%       8.009us       2.670us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.69%       5.650us         0.84%       6.800us       2.267us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 750.709us
-Self CUDA time total: 64.605us
+Self CPU time total: 813.049us
+Self CUDA time total: 64.030us
 
 
 
@@ -4590,29 +4598,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.798us       467.68%     328.798us     328.798us             1  
-                                            torch_eager        14.69%     115.264us        99.37%     779.669us     779.669us       0.000us         0.00%      74.432us      74.432us             1  
-                                           aten::conv1d         0.75%       5.869us        14.89%     116.853us      38.951us       0.000us         0.00%      47.840us      15.947us             3  
-                                      aten::convolution         1.20%       9.412us        14.15%     110.984us      36.995us       0.000us         0.00%      47.840us      15.947us             3  
-                                     aten::_convolution         2.99%      23.451us        12.95%     101.572us      33.857us       0.000us         0.00%      47.840us      15.947us             3  
-                                aten::_conv_depthwise2d         2.71%      21.281us         8.10%      63.532us      21.177us      47.840us        68.05%      47.840us      15.947us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.840us        68.05%      47.840us      15.947us             3  
-                                               aten::to         0.74%       5.828us        66.46%     521.411us      86.902us       0.000us         0.00%      26.592us       4.432us             6  
-                                         aten::_to_copy         3.27%      25.622us        65.71%     515.583us      85.931us       0.000us         0.00%      26.592us       4.432us             6  
-                                            aten::copy_         6.42%      50.382us        58.46%     458.651us      76.442us      22.464us        31.95%      26.592us       4.432us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.032us        17.11%      12.032us       4.011us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        14.84%      10.432us       3.477us             3  
-                                Activity Buffer Request        29.93%     234.846us        29.93%     234.846us     234.846us       4.128us         5.87%       4.128us       4.128us             1  
-                                    aten::empty_strided         3.99%      31.310us         3.99%      31.310us       5.218us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.83%     194.803us        24.83%     194.803us      21.645us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.07%      16.243us         2.72%      21.332us       2.370us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       8.401us         1.07%       8.401us       0.560us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.35%      10.581us         1.35%      10.581us       3.527us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.31%      10.290us         1.31%      10.290us       3.430us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.69%       5.406us         0.84%       6.568us       2.189us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.852us       467.34%     325.852us     325.852us             1  
+                                            torch_eager        14.81%     118.946us        99.39%     798.399us     798.399us       0.000us         0.00%      73.789us      73.789us             1  
+                                           aten::conv1d         0.70%       5.610us        14.26%     114.513us      38.171us       0.000us         0.00%      47.294us      15.765us             3  
+                                      aten::convolution         1.17%       9.382us        13.56%     108.903us      36.301us       0.000us         0.00%      47.294us      15.765us             3  
+                                     aten::_convolution         2.75%      22.119us        12.39%      99.521us      33.174us       0.000us         0.00%      47.294us      15.765us             3  
+                                aten::_conv_depthwise2d         2.53%      20.361us         7.64%      61.351us      20.450us      47.294us        67.83%      47.294us      15.765us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.294us        67.83%      47.294us      15.765us             3  
+                                               aten::to         0.79%       6.379us        67.07%     538.781us      89.797us       0.000us         0.00%      26.495us       4.416us             6  
+                                         aten::_to_copy         2.79%      22.401us        66.28%     532.402us      88.734us       0.000us         0.00%      26.495us       4.416us             6  
+                                            aten::copy_         6.32%      50.749us        59.88%     480.970us      80.162us      22.431us        32.17%      26.495us       4.416us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.031us        17.25%      12.031us       4.010us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        14.92%      10.400us       3.467us             3  
+                                Activity Buffer Request        31.04%     249.326us        31.04%     249.326us     249.326us       4.064us         5.83%       4.064us       4.064us             1  
+                                    aten::empty_strided         3.61%      29.031us         3.61%      29.031us       4.839us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.24%     202.725us        25.24%     202.725us      22.525us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.08%      16.713us         2.70%      21.680us       2.409us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.568us         1.07%       8.568us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.32%      10.569us         1.32%      10.569us       3.523us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.07%       8.591us         1.07%       8.591us       2.864us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.82%       6.609us         1.00%       8.010us       2.670us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 784.589us
-Self CUDA time total: 70.304us
+Self CPU time total: 803.260us
+Self CUDA time total: 69.725us
 
 
 
@@ -4622,29 +4630,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.882us       182.91%     341.882us     341.882us             1  
-                                            torch_eager        15.14%     117.185us        99.33%     768.879us     768.879us       0.000us         0.00%     197.117us     197.117us             1  
-                                           aten::conv1d         0.79%       6.110us        14.86%     114.993us      38.331us       0.000us         0.00%     134.270us      44.757us             3  
-                                      aten::convolution         1.22%       9.451us        14.07%     108.883us      36.294us       0.000us         0.00%     134.270us      44.757us             3  
-                                     aten::_convolution         2.87%      22.240us        12.85%      99.432us      33.144us       0.000us         0.00%     134.270us      44.757us             3  
-                                aten::_conv_depthwise2d         2.84%      21.991us         8.04%      62.222us      20.741us     134.270us        71.84%     134.270us      44.757us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     134.270us        71.84%     134.270us      44.757us             3  
-                                               aten::to         0.77%       5.950us        65.77%     509.102us      84.850us       0.000us         0.00%      62.847us      10.474us             6  
-                                         aten::_to_copy         3.29%      25.489us        65.00%     503.152us      83.859us       0.000us         0.00%      62.847us      10.474us             6  
-                                            aten::copy_         6.45%      49.889us        57.58%     445.721us      74.287us      52.639us        28.16%      62.847us      10.474us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.728us        15.91%      29.728us       9.909us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.911us        12.26%      22.911us       7.637us             3  
-                                Activity Buffer Request        28.61%     221.416us        28.61%     221.416us     221.416us      10.208us         5.46%      10.208us      10.208us             1  
-                                    aten::empty_strided         4.13%      31.942us         4.13%      31.942us       5.324us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.24%     195.386us        25.24%     195.386us      21.710us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.14%      16.602us         2.90%      22.460us       2.496us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.19%       9.247us         1.19%       9.247us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.23%       9.500us         1.23%       9.500us       3.167us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.26%       9.761us         1.26%       9.761us       3.254us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.71%       5.470us         0.87%       6.700us       2.233us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     346.241us       186.20%     346.241us     346.241us             1  
+                                            torch_eager         7.14%     146.093us        99.75%       2.041ms       2.041ms       0.000us         0.00%     196.000us     196.000us             1  
+                                           aten::conv1d         0.31%       6.251us         5.79%     118.533us      39.511us       0.000us         0.00%     133.248us      44.416us             3  
+                                      aten::convolution         0.51%      10.359us         5.49%     112.282us      37.427us       0.000us         0.00%     133.248us      44.416us             3  
+                                     aten::_convolution         1.19%      24.280us         4.98%     101.923us      33.974us       0.000us         0.00%     133.248us      44.416us             3  
+                                aten::_conv_depthwise2d         1.04%      21.191us         3.02%      61.762us      20.587us     133.248us        71.66%     133.248us      44.416us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.248us        71.66%     133.248us      44.416us             3  
+                                               aten::to         0.32%       6.489us        85.48%       1.749ms     291.443us       0.000us         0.00%      62.752us      10.459us             6  
+                                         aten::_to_copy         1.16%      23.832us        85.16%       1.742ms     290.362us       0.000us         0.00%      62.752us      10.459us             6  
+                                            aten::copy_         2.53%      51.751us        82.59%       1.689ms     281.575us      52.704us        28.34%      62.752us      10.459us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.760us        16.00%      29.760us       9.920us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.944us        12.34%      22.944us       7.648us             3  
+                                Activity Buffer Request        71.04%       1.453ms        71.04%       1.453ms       1.453ms      10.048us         5.40%      10.048us      10.048us             1  
+                                    aten::empty_strided         1.41%      28.891us         1.41%      28.891us       4.815us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.04%     205.324us        10.04%     205.324us      22.814us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.86%      17.550us         1.11%      22.661us       2.518us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.42%       8.641us         0.42%       8.641us       0.576us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      10.750us         0.53%      10.750us       3.583us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       9.021us         0.44%       9.021us       3.007us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.220us         0.37%       7.470us       2.490us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 774.039us
-Self CUDA time total: 186.909us
+Self CPU time total: 2.046ms
+Self CUDA time total: 185.952us
 
 
 
@@ -4654,29 +4662,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     349.277us       165.88%     349.277us     349.277us             1  
-                                            torch_eager        15.39%     117.165us        99.36%     756.609us     756.609us       0.000us         0.00%     224.029us     224.029us             1  
-                                           aten::conv1d         0.74%       5.661us        15.33%     116.734us      38.911us       0.000us         0.00%     154.686us      51.562us             3  
-                                      aten::convolution         1.20%       9.150us        14.59%     111.073us      37.024us       0.000us         0.00%     154.686us      51.562us             3  
-                                     aten::_convolution         2.96%      22.532us        13.38%     101.923us      33.974us       0.000us         0.00%     154.686us      51.562us             3  
-                                aten::_conv_depthwise2d         2.86%      21.751us         8.47%      64.492us      21.497us     154.686us        73.47%     154.686us      51.562us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.686us        73.47%     154.686us      51.562us             3  
-                                               aten::to         0.84%       6.379us        65.15%     496.150us      82.692us       0.000us         0.00%      69.343us      11.557us             6  
-                                         aten::_to_copy         3.33%      25.371us        64.32%     489.771us      81.628us       0.000us         0.00%      69.343us      11.557us             6  
-                                            aten::copy_         6.44%      49.031us        56.76%     432.240us      72.040us      55.871us        26.53%      69.343us      11.557us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.831us        15.59%      32.831us      10.944us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        10.94%      23.040us       7.680us             3  
-                                Activity Buffer Request        27.33%     208.145us        27.33%     208.145us     208.145us      13.472us         6.40%      13.472us      13.472us             1  
-                                    aten::empty_strided         4.22%      32.160us         4.22%      32.160us       5.360us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.87%     197.025us        25.87%     197.025us      21.892us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.14%      16.329us         2.83%      21.520us       2.391us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.17%       8.932us         1.17%       8.932us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.38%      10.500us         1.38%      10.500us       3.500us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.35%      10.280us         1.35%      10.280us       3.427us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       5.468us         0.90%       6.839us       2.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.281us       162.73%     341.281us     341.281us             1  
+                                            torch_eager        15.25%     117.693us        99.36%     766.878us     766.878us       0.000us         0.00%     223.168us     223.168us             1  
+                                           aten::conv1d         0.68%       5.279us        14.60%     112.702us      37.567us       0.000us         0.00%     154.016us      51.339us             3  
+                                      aten::convolution         1.24%       9.560us        13.92%     107.423us      35.808us       0.000us         0.00%     154.016us      51.339us             3  
+                                     aten::_convolution         2.67%      20.611us        12.68%      97.863us      32.621us       0.000us         0.00%     154.016us      51.339us             3  
+                                aten::_conv_depthwise2d         2.74%      21.170us         8.14%      62.852us      20.951us     154.016us        73.44%     154.016us      51.339us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.016us        73.44%     154.016us      51.339us             3  
+                                               aten::to         0.75%       5.750us        66.22%     511.121us      85.187us       0.000us         0.00%      69.152us      11.525us             6  
+                                         aten::_to_copy         2.86%      22.060us        65.48%     505.371us      84.228us       0.000us         0.00%      69.152us      11.525us             6  
+                                            aten::copy_         6.81%      52.581us        58.74%     453.391us      75.565us      55.712us        26.56%      69.152us      11.525us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.705us        15.59%      32.705us      10.902us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.007us        10.97%      23.007us       7.669us             3  
+                                Activity Buffer Request        28.38%     219.045us        28.38%     219.045us     219.045us      13.440us         6.41%      13.440us      13.440us             1  
+                                    aten::empty_strided         3.88%      29.920us         3.88%      29.920us       4.987us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.50%     204.546us        26.50%     204.546us      22.727us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.10%      16.212us         2.69%      20.781us       2.309us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       7.798us         1.01%       7.798us       0.520us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.33%      10.250us         1.33%      10.250us       3.417us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       8.651us         1.12%       8.651us       2.884us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.470us         0.87%       6.730us       2.243us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 761.499us
-Self CUDA time total: 210.557us
+Self CPU time total: 771.798us
+Self CUDA time total: 209.728us
 
 
 
@@ -4686,29 +4694,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.72%     121.944us        52.58%     953.714us     953.714us       0.000us         0.00%       1.521ms       1.521ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.421ms       100.41%       1.421ms       1.421ms             1  
-                                               aten::to         0.35%       6.300us        37.63%     682.555us     113.759us       0.000us         0.00%     824.097us     137.350us             6  
-                                         aten::_to_copy         1.68%      30.549us        37.28%     676.255us     112.709us       0.000us         0.00%     824.097us     137.350us             6  
-                                            aten::copy_         2.98%      53.981us        24.83%     450.422us      75.070us     718.817us        50.79%     824.097us     137.350us             6  
-                                           aten::conv1d         0.35%       6.281us         6.65%     120.554us      40.185us       0.000us         0.00%     696.543us     232.181us             3  
-                                      aten::convolution         0.57%      10.251us         6.30%     114.273us      38.091us       0.000us         0.00%     696.543us     232.181us             3  
-                                     aten::_convolution         1.27%      23.111us         5.73%     104.022us      34.674us       0.000us         0.00%     696.543us     232.181us             3  
-                                aten::_conv_depthwise2d         1.23%      22.359us         3.60%      65.321us      21.774us     696.543us        49.21%     696.543us     232.181us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.543us        49.21%     696.543us     232.181us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     409.920us        28.96%     409.920us     136.640us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     308.897us        21.82%     308.897us     102.966us             3  
-                                Activity Buffer Request        11.98%     217.246us        11.98%     217.246us     217.246us     105.280us         7.44%     105.280us     105.280us             1  
-                                    aten::empty_strided         2.17%      39.370us        10.77%     195.284us      32.547us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.13%     201.976us        11.13%     201.976us      22.442us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.99%      18.030us         1.31%      23.761us       2.640us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.53%       9.620us         0.53%       9.620us       0.641us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.59%      10.751us         0.59%      10.751us       3.584us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.52%       9.430us         0.52%       9.430us       3.143us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       5.670us         0.39%       7.030us       2.343us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.80%     123.239us        52.01%     942.341us     942.341us       0.000us         0.00%       1.520ms       1.520ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.422ms       100.42%       1.422ms       1.422ms             1  
+                                               aten::to         0.34%       6.231us        36.98%     669.957us     111.660us       0.000us         0.00%     824.603us     137.434us             6  
+                                         aten::_to_copy         1.55%      28.122us        36.63%     663.726us     110.621us       0.000us         0.00%     824.603us     137.434us             6  
+                                            aten::copy_         2.95%      53.430us        24.71%     447.748us      74.625us     720.572us        50.89%     824.603us     137.434us             6  
+                                           aten::conv1d         0.34%       6.111us         6.66%     120.744us      40.248us       0.000us         0.00%     695.357us     231.786us             3  
+                                      aten::convolution         0.56%      10.201us         6.33%     114.633us      38.211us       0.000us         0.00%     695.357us     231.786us             3  
+                                     aten::_convolution         1.36%      24.689us         5.76%     104.432us      34.811us       0.000us         0.00%     695.357us     231.786us             3  
+                                aten::_conv_depthwise2d         1.22%      22.151us         3.50%      63.431us      21.144us     695.357us        49.11%     695.357us     231.786us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     695.357us        49.11%     695.357us     231.786us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     407.263us        28.76%     407.263us     135.754us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     313.309us        22.13%     313.309us     104.436us             3  
+                                Activity Buffer Request        11.46%     207.684us        11.46%     207.684us     207.684us     104.031us         7.35%     104.031us     104.031us             1  
+                                    aten::empty_strided         2.02%      36.603us        10.37%     187.856us      31.309us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.47%     207.874us        11.47%     207.874us      23.097us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.99%      18.011us         1.30%      23.581us       2.620us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.51%       9.270us         0.51%       9.270us       0.618us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.60%      10.830us         0.60%      10.830us       3.610us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%       9.210us         0.51%       9.210us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.35%       6.401us         0.43%       7.711us       2.570us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.814ms
-Self CUDA time total: 1.415ms
+Self CPU time total: 1.812ms
+Self CUDA time total: 1.416ms
 
 
 
@@ -4718,29 +4726,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         4.05%     123.714us        65.96%       2.016ms       2.016ms       0.000us         0.00%       1.502ms       1.502ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.433ms       100.43%       1.433ms       1.433ms             1  
-                                               aten::to         0.21%       6.507us        56.82%       1.737ms     289.475us       0.000us         0.00%     764.927us     127.488us             6  
-                                         aten::_to_copy         0.85%      25.961us        56.61%       1.730ms     288.391us       0.000us         0.00%     764.927us     127.488us             6  
-                                            aten::copy_         1.76%      53.800us        54.73%       1.673ms     278.832us     689.887us        48.36%     764.927us     127.488us             6  
-                                           aten::conv1d         0.20%       6.220us         4.18%     127.663us      42.554us       0.000us         0.00%     736.735us     245.578us             3  
-                                      aten::convolution         0.34%      10.420us         3.97%     121.443us      40.481us       0.000us         0.00%     736.735us     245.578us             3  
-                                     aten::_convolution         0.75%      22.860us         3.63%     111.023us      37.008us       0.000us         0.00%     736.735us     245.578us             3  
-                                aten::_conv_depthwise2d         0.96%      29.441us         2.37%      72.583us      24.194us     736.735us        51.64%     736.735us     245.578us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     736.735us        51.64%     736.735us     245.578us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.471us        27.86%     397.471us     132.490us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     292.416us        20.50%     292.416us      97.472us             3  
-                                Activity Buffer Request        47.26%       1.445ms        47.26%       1.445ms       1.445ms      75.040us         5.26%      75.040us      75.040us             1  
-                                    aten::empty_strided         1.03%      31.391us         1.03%      31.391us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         6.45%     197.169us         6.45%     197.169us      21.908us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.57%      17.300us         0.75%      22.850us       2.539us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.30%       9.200us         0.30%       9.200us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.32%       9.780us         0.32%       9.780us       3.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.36%      10.870us         0.36%      10.870us       3.623us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.19%       5.770us         0.23%       7.180us       2.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.34%     116.852us        42.22%     778.698us     778.698us       0.000us         0.00%       1.501ms       1.501ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.431ms       100.41%       1.431ms       1.431ms             1  
+                                               aten::to         0.32%       5.860us        28.27%     521.352us      86.892us       0.000us         0.00%     762.491us     127.082us             6  
+                                         aten::_to_copy         1.22%      22.580us        27.95%     515.492us      85.915us       0.000us         0.00%     762.491us     127.082us             6  
+                                            aten::copy_         2.82%      52.081us        25.06%     462.219us      77.037us     686.779us        48.19%     762.491us     127.082us             6  
+                                           aten::conv1d         0.30%       5.601us         6.22%     114.743us      38.248us       0.000us         0.00%     738.362us     246.121us             3  
+                                      aten::convolution         0.51%       9.380us         5.92%     109.142us      36.381us       0.000us         0.00%     738.362us     246.121us             3  
+                                     aten::_convolution         1.20%      22.202us         5.41%      99.762us      33.254us       0.000us         0.00%     738.362us     246.121us             3  
+                                aten::_conv_depthwise2d         1.10%      20.251us         3.34%      61.660us      20.553us     738.362us        51.81%     738.362us     246.121us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     738.362us        51.81%     738.362us     246.121us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     398.333us        27.95%     398.333us     132.778us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     288.446us        20.24%     288.446us      96.149us             3  
+                                Activity Buffer Request        12.41%     228.855us        12.41%     228.855us     228.855us      75.712us         5.31%      75.712us      75.712us             1  
+                                    aten::empty_strided         1.66%      30.693us         1.66%      30.693us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.01%     202.993us        11.01%     202.993us      22.555us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      16.659us         1.18%      21.780us       2.420us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       8.512us         0.46%       8.512us       0.567us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%       9.739us         0.53%       9.739us       3.246us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.54%       9.960us         0.54%       9.960us       3.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.33%       6.150us         0.40%       7.440us       2.480us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.057ms
-Self CUDA time total: 1.427ms
+Self CPU time total: 1.844ms
+Self CUDA time total: 1.425ms
 
 
 impl                     wl                  p50(ms)  ok
@@ -4765,16 +4773,10 @@ torch_eager              cuda_B4_D2048_S512_W4     0.10  True
 torch_eager              cuda_B4_D64_S128_W2     0.08  True
 torch_eager              cuda_B4_D64_S128_W4     0.08  True
 torch_eager              cuda_B4_D64_S2048_W2     0.08  True
-torch_eager              cuda_B4_D64_S2048_W4     0.09  True
+torch_eager              cuda_B4_D64_S2048_W4     0.08  True
 torch_eager              cuda_B4_D64_S512_W2     0.08  True
 torch_eager              cuda_B4_D64_S512_W4     0.08  True
 
-
-
▶ UV Install Logs
- -

Artifacts:

causal_conv1d.jsonl diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg index 07cfbdf7d6b5520fa7d67c8819a8378d9bcd8cb5..6d35d2ab09083e61b698925d53e021616e6652ab 100644 --- a/causal_conv1d/results/artifacts/combine/latency.svg +++ b/causal_conv1d/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d92f3a3aa92e11f21958cf1c591a4e709fd40f7b0cccbd544c1e1a77b11bcd2 -size 35429 +oid sha256:1d242a099b34afd09f08c43f438a0f0428d98a0ebd51a9a36d0be25ca9da89df +size 35416 diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html index 45b22fabef9b9c6a15964465834db2598fd9e481..696a6ce6dd37123c817b131363a2aec4433b982e 100644 --- a/causal_conv1d/results/combined_results.html +++ b/causal_conv1d/results/combined_results.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-29T14:27:58.771179 + 2025-10-29T15:50:56.264680 image/svg+xml @@ -4216,70 +4224,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 @@ -4287,66 +4295,66 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + @@ -4405,7 +4413,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.32s +Cell: combine | 4.35s | Raw @@ -4498,12 +4506,12 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2 0.05 True hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True -hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True -hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.06 True -hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.06 True +hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.04 True +hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True +hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True -hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.06 True -hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.06 True +hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True +hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True @@ -4514,7 +4522,7 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True -hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.06 True +hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True torch_eager cuda_B2_D2048_S128_W2 0.08 True torch_eager cuda_B2_D2048_S128_W4 0.08 True @@ -4537,7 +4545,7 @@ torch_eager cuda_B4_D2048_S512_W4 0.10 True torch_eager cuda_B4_D64_S128_W2 0.08 True torch_eager cuda_B4_D64_S128_W4 0.08 True torch_eager cuda_B4_D64_S2048_W2 0.08 True -torch_eager cuda_B4_D64_S2048_W4 0.09 True +torch_eager cuda_B4_D64_S2048_W4 0.08 True torch_eager cuda_B4_D64_S512_W2 0.08 True torch_eager cuda_B4_D64_S512_W4 0.08 True @@ -4559,7 +4567,7 @@ Implementations included:
▶ UV Install Logs
@@ -4572,7 +4580,7 @@ Installed 37 packages in 214ms - 2025-10-29T14:27:58.771179 + 2025-10-29T15:50:56.264680 image/svg+xml @@ -4916,70 +4924,70 @@ Installed 37 packages in 214ms - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 @@ -4987,66 +4995,66 @@ Installed 37 packages in 214ms - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl index dfaf0c99c533e861b9b0cf0a7d640e38745db1c9..87cdd099743a8bb2627fdf26ead020fa05255bc2 100644 --- a/flash_attn/impls/artifacts/benchmark/attention.jsonl +++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl @@ -1,6 +1,6 @@ -{"ts": "2025-10-29T14:27:40Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.98791400000664, "p50": 0.995113999977093, "p90": 1.0003840000081254, "mean": 0.9967803999984426, "iqr": 0.00634899998885885, "raw_times": [0.98791400000664, 0.9940350000192666, 1.006454999981088, 1.0003840000081254, 0.995113999977093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0150049999992916, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0233649999804584, "p50": 1.0321449999537435, "p90": 1.0348449999924014, "mean": 1.032277399974646, "iqr": 0.009739000006447895, "raw_times": [1.045925999960673, 1.0251059999859535, 1.0321449999537435, 1.0233649999804584, 1.0348449999924014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.035865999995167, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0737370000128976, "p50": 1.084086999981082, "p90": 1.088675999994848, "mean": 1.0826705999988917, "iqr": 0.0148400000057336, "raw_times": [1.0738359999891145, 1.0930170000165162, 1.084086999981082, 1.0737370000128976, 1.088675999994848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0889670000437945, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0887770000067576, "p50": 1.0916359999555425, "p90": 1.096396999969329, "mean": 1.0932085999797891, "iqr": 0.005600999998023326, "raw_times": [1.0887770000067576, 1.0907959999713057, 1.0916359999555425, 1.0984369999960109, 1.096396999969329], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1031370000296192, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2465009999687027, "p50": 1.2523310000460697, "p90": 1.2523909999799798, "mean": 1.2538410000047406, "iqr": 0.005290999979479238, "raw_times": [1.2523310000460697, 1.2523909999799798, 1.2471000000005006, 1.2465009999687027, 1.2708820000284504], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2551809999763464, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2355700000057368, "p50": 1.241141000036805, "p90": 1.2576200000466997, "mean": 1.2477664000130062, "iqr": 0.02047000003813082, "raw_times": [1.2355700000057368, 1.241141000036805, 1.237150000008569, 1.2576200000466997, 1.2673509999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2579809999806457, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:47Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.8233430000123008, "p50": 1.8343830000162598, "p90": 1.8450139999686144, "mean": 1.8363673999942876, "iqr": 0.021300000014434772, "raw_times": [1.8450139999686144, 1.8233430000123008, 1.8237139999541796, 1.8343830000162598, 1.8553830000200833], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.8232439999792405, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:47Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.8942840000022443, "p50": 1.9424449999974058, "p90": 1.9434060000094178, "mean": 1.9367254000030698, "iqr": 0.0023400000372930663, "raw_times": [1.8942840000022443, 1.9424449999974058, 1.9410659999721247, 1.9434060000094178, 1.9624260000341565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.9008649999818772, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.942595999992136, "p50": 1.9503360000499015, "p90": 2.019877999998698, "mean": 1.9758666000029734, "iqr": 0.0764520000302582, "raw_times": [1.9503360000499015, 1.94342599996844, 1.942595999992136, 2.019877999998698, 2.0230970000056914], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.9501660000287302, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.9654459999856044, "p50": 2.0491880000008678, "p90": 2.050657999973282, "mean": 2.0347600000036437, "iqr": 0.0033989999224104395, "raw_times": [1.9654459999856044, 2.0491880000008678, 2.0472590000508717, 2.050657999973282, 2.0612490000075923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.0352980000097887, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.0188670000038655, "p50": 2.067507999981899, "p90": 2.1027900000376576, "mean": 2.0633722000184207, "iqr": 0.07837300000801406, "raw_times": [2.0188670000038655, 2.0244170000296435, 2.067507999981899, 2.1027900000376576, 2.103279000039038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.0235979999938536, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T15:50:48Z", "run": "cdb0295471e5453fb4c555529da92783", "impl": "torch_mem_eff", "tags": {"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.1849919999681333, "p50": 2.1887119999632887, "p90": 2.2487329999876238, "mean": 2.212510399988332, "iqr": 0.06324099996390942, "raw_times": [2.1849919999681333, 2.1887119999632887, 2.1854920000237144, 2.2487329999876238, 2.254622999998901], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.1668410000188487, "peak_bytes": 319946752, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null} diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py index 04ae262009c3d6e33aaa3e392d28c903f24c287c..dd8f743d8e47bc2cce2c7e34675377c075df938f 100644 --- a/flash_attn/impls/cells/benchmark.py +++ b/flash_attn/impls/cells/benchmark.py @@ -4,7 +4,6 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", -# "xformers", # ] # # [tool.uv.sources] @@ -13,18 +12,20 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -import xformers.ops as xops -def xformers_attention(q, k, v): - """xFormers memory efficient attention""" - # xFormers expects [batch, seq_len, heads, head_dim] - return xops.memory_efficient_attention(q, k, v) +def torch_mem_eff(q, k, v): + qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v)) + with torch.nn.attention.sdpa_kernel( + torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION + ): + o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt) + return o.transpose(1, 2).contiguous() run_benchmark( kernel_type=KernelTypeEnum.ATTENTION, - impl_name="xformers_meff", - impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"}, - impl_func=xformers_attention, + impl_name="torch_mem_eff", + impl_tags={"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"}, + impl_func=torch_mem_eff, ) \ No newline at end of file diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html index a6e50f4eba46389d1f17c35d67cbb770dc3d8952..e3be71b424e37c32c0c21e61ae09312876a15d8b 100644 --- a/flash_attn/impls/flash_attention.html +++ b/flash_attn/impls/flash_attention.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.28s +Cell: nv | 0.26s | Raw @@ -3888,7 +3896,7 @@ Cell: nv | 0.28s
-
Wed Oct 29 14:25:53 2025       
+
Wed Oct 29 15:50:02 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3897,7 +3905,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P8             21W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   29C    P0            165W /  350W |       0MiB /  46068MiB |     61%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3919,9 +3927,9 @@ Cell: nv | 0.28s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 32.77s
+Cell: benchmark | 3.82s
  | 
 
 Raw
@@ -3972,29 +3980,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.644ms       102.02%       3.644ms       3.644ms             1  
-                                         torch_flash_ma         6.80%     356.846us        47.04%       2.468ms       2.468ms       0.000us         0.00%       3.612ms       3.612ms             1  
-                     aten::scaled_dot_product_attention         0.82%      43.042us         4.47%     234.776us      78.259us       0.000us         0.00%       2.857ms     952.201us             3  
-              aten::_scaled_dot_product_flash_attention         0.56%      29.330us         3.65%     191.734us      63.911us       0.000us         0.00%       2.857ms     952.201us             3  
-                         aten::_flash_attention_forward         0.75%      39.581us         2.59%     135.674us      45.225us       2.857ms        79.97%       2.857ms     952.201us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.857ms        79.97%       2.857ms     952.201us             3  
-                                       aten::contiguous         0.27%      14.180us        34.32%       1.801ms     150.051us       0.000us         0.00%     755.680us      62.973us            12  
-                                            aten::clone         0.74%      38.791us        34.04%       1.786ms     148.870us       0.000us         0.00%     755.680us      62.973us            12  
-                                            aten::copy_         1.85%      97.030us        31.43%       1.649ms     137.429us     715.456us        20.03%     755.680us      62.973us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     715.456us        20.03%     715.456us      59.621us            12  
-                                Activity Buffer Request        27.38%       1.437ms        27.38%       1.437ms       1.437ms      40.224us         1.13%      40.224us      40.224us             1  
-                                        aten::transpose         1.47%      77.273us         1.96%     102.714us       4.280us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.48%      25.441us         0.48%      25.441us       1.060us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.70%      36.821us         2.35%     123.326us       8.222us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.93%     101.493us         1.93%     101.493us       4.229us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.70%     141.775us         2.70%     141.775us       9.452us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.35%      18.402us         0.35%      18.402us       6.134us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.05%       2.540us         0.05%       2.540us       0.423us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.17%       8.890us         0.17%       8.890us       2.963us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        52.96%       2.779ms        52.96%       2.779ms       2.779ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.562ms       101.45%       3.562ms       3.562ms             1  
+                                         torch_flash_ma         6.38%     328.580us        45.84%       2.360ms       2.360ms       0.000us         0.00%       3.551ms       3.551ms             1  
+                     aten::scaled_dot_product_attention         0.79%      40.571us         4.12%     212.315us      70.772us       0.000us         0.00%       2.798ms     932.779us             3  
+              aten::_scaled_dot_product_flash_attention         0.52%      26.642us         3.34%     171.744us      57.248us       0.000us         0.00%       2.798ms     932.779us             3  
+                         aten::_flash_attention_forward         0.74%      37.939us         2.40%     123.383us      41.128us       2.798ms        79.71%       2.798ms     932.779us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.798ms        79.71%       2.798ms     932.779us             3  
+                                       aten::contiguous         0.27%      13.720us        34.12%       1.757ms     146.409us       0.000us         0.00%     752.288us      62.691us            12  
+                                            aten::clone         0.73%      37.449us        33.85%       1.743ms     145.266us       0.000us         0.00%     752.288us      62.691us            12  
+                                            aten::copy_         1.68%      86.484us        31.57%       1.625ms     135.456us     712.095us        20.29%     752.288us      62.691us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     712.095us        20.29%     712.095us      59.341us            12  
+                                Activity Buffer Request        28.00%       1.442ms        28.00%       1.442ms       1.442ms      40.193us         1.14%      40.193us      40.193us             1  
+                                        aten::transpose         1.22%      62.637us         1.64%      84.218us       3.509us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.42%      21.581us         0.42%      21.581us       0.899us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.48%      24.619us         1.97%     101.523us       6.768us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.76%      90.465us         1.76%      90.465us       3.769us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.36%     121.521us         2.36%     121.521us       8.101us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.31%      15.721us         0.31%      15.721us       5.240us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       2.280us         0.04%       2.280us       0.380us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.16%       8.181us         0.16%       8.181us       2.727us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        54.16%       2.789ms        54.16%       2.789ms       2.789ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.247ms
-Self CUDA time total: 3.572ms
+Self CPU time total: 5.149ms
+Self CUDA time total: 3.510ms
 
 
 
@@ -4004,29 +4012,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.70%     246.528us        41.73%       2.189ms       2.189ms       0.000us         0.00%       3.817ms       3.817ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.772ms       100.28%       3.772ms       3.772ms             1  
-                     aten::scaled_dot_product_attention         0.51%      26.610us         3.43%     180.143us      60.048us       0.000us         0.00%       2.999ms     999.573us             3  
-              aten::_scaled_dot_product_flash_attention         0.37%      19.600us         2.93%     153.533us      51.178us       0.000us         0.00%       2.999ms     999.573us             3  
-                         aten::_flash_attention_forward         0.63%      32.980us         2.12%     111.443us      37.148us       2.999ms        79.71%       2.999ms     999.573us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.999ms        79.71%       2.999ms     999.573us             3  
-                                       aten::contiguous         0.19%      10.030us        32.68%       1.715ms     142.893us       0.000us         0.00%     818.210us      68.184us            12  
-                                            aten::clone         0.55%      29.002us        32.49%       1.705ms     142.057us       0.000us         0.00%     818.210us      68.184us            12  
-                                            aten::copy_         2.09%     109.441us        30.74%       1.613ms     134.399us     763.297us        20.29%     818.210us      68.184us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     763.297us        20.29%     763.297us      63.608us            12  
-                                Activity Buffer Request        26.94%       1.413ms        26.94%       1.413ms       1.413ms      54.913us         1.46%      54.913us      54.913us             1  
-                                        aten::transpose         1.00%      52.652us         1.34%      70.433us       2.935us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.34%      17.781us         0.34%      17.781us       0.741us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.38%      19.980us         1.61%      84.581us       5.639us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.45%      76.201us         1.45%      76.201us       3.175us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.16%     113.102us         2.16%     113.102us       7.540us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.31%      16.430us         0.31%      16.430us       5.477us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.751us         0.03%       1.751us       0.292us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.771us         0.07%       3.771us       1.257us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.27%       3.058ms        58.27%       3.058ms       3.058ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.71%     257.538us        44.52%       2.436ms       2.436ms       0.000us         0.00%       3.763ms       3.763ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.719ms       100.29%       3.719ms       3.719ms             1  
+                     aten::scaled_dot_product_attention         0.45%      24.440us         3.30%     180.683us      60.228us       0.000us         0.00%       2.948ms     982.525us             3  
+              aten::_scaled_dot_product_flash_attention         0.35%      18.890us         2.86%     156.243us      52.081us       0.000us         0.00%       2.948ms     982.525us             3  
+                         aten::_flash_attention_forward         0.68%      37.218us         2.07%     113.133us      37.711us       2.948ms        79.49%       2.948ms     982.525us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.948ms        79.49%       2.948ms     982.525us             3  
+                                       aten::contiguous         0.16%       8.651us        35.72%       1.955ms     162.890us       0.000us         0.00%     815.678us      67.973us            12  
+                                            aten::clone         0.48%      26.452us        35.56%       1.946ms     162.169us       0.000us         0.00%     815.678us      67.973us            12  
+                                            aten::copy_         1.81%      99.279us        33.97%       1.859ms     154.885us     760.479us        20.51%     815.678us      67.973us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     760.479us        20.51%     760.479us      63.373us            12  
+                                Activity Buffer Request        30.60%       1.674ms        30.60%       1.674ms       1.674ms      55.199us         1.49%      55.199us      55.199us             1  
+                                        aten::transpose         0.92%      50.270us         1.23%      67.460us       2.811us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.31%      17.190us         0.31%      17.190us       0.716us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.34%      18.723us         1.45%      79.503us       5.300us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.39%      75.933us         1.39%      75.933us       3.164us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.98%     108.143us         1.98%     108.143us       7.210us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.25%      13.599us         0.25%      13.599us       4.533us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.831us         0.03%       1.831us       0.305us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.690us         0.07%       3.690us       1.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        55.48%       3.036ms        55.48%       3.036ms       3.036ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.247ms
-Self CUDA time total: 3.762ms
+Self CPU time total: 5.472ms
+Self CUDA time total: 3.708ms
 
 
 
@@ -4036,29 +4044,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.50%     237.986us        41.18%       2.178ms       2.178ms       0.000us         0.00%       3.833ms       3.833ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.785ms       100.29%       3.785ms       3.785ms             1  
-                     aten::scaled_dot_product_attention         0.46%      24.381us         3.40%     179.915us      59.972us       0.000us         0.00%       2.998ms     999.221us             3  
-              aten::_scaled_dot_product_flash_attention         0.36%      19.171us         2.94%     155.534us      51.845us       0.000us         0.00%       2.998ms     999.221us             3  
-                         aten::_flash_attention_forward         0.65%      34.259us         2.15%     113.691us      37.897us       2.998ms        79.44%       2.998ms     999.221us             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.998ms        79.44%       2.998ms     999.221us             3  
-                                       aten::contiguous         0.19%       9.800us        32.38%       1.712ms     142.708us       0.000us         0.00%     835.263us      69.605us            12  
-                                            aten::clone         0.53%      28.211us        32.20%       1.703ms     141.891us       0.000us         0.00%     835.263us      69.605us            12  
-                                            aten::copy_         1.60%      84.650us        30.46%       1.611ms     134.247us     776.063us        20.56%     835.263us      69.605us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     776.063us        20.56%     776.063us      64.672us            12  
-                                Activity Buffer Request        27.18%       1.437ms        27.18%       1.437ms       1.437ms      59.200us         1.57%      59.200us      59.200us             1  
-                                        aten::transpose         0.99%      52.225us         1.33%      70.125us       2.922us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.34%      17.900us         0.34%      17.900us       0.746us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.37%      19.782us         1.60%      84.803us       5.654us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.45%      76.431us         1.45%      76.431us       3.185us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.16%     114.204us         2.16%     114.204us       7.614us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.30%      16.100us         0.30%      16.100us       5.367us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.730us         0.03%       1.730us       0.288us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.730us         0.07%       3.730us       1.243us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.82%       3.110ms        58.82%       3.110ms       3.110ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.65%     248.558us        40.70%       2.176ms       2.176ms       0.000us         0.00%       3.868ms       3.868ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.819ms       100.29%       3.819ms       3.819ms             1  
+                     aten::scaled_dot_product_attention         0.45%      24.181us         3.36%     179.834us      59.945us       0.000us         0.00%       3.027ms       1.009ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      18.100us         2.91%     155.653us      51.884us       0.000us         0.00%       3.027ms       1.009ms             3  
+                         aten::_flash_attention_forward         0.73%      38.760us         2.16%     115.412us      38.471us       3.027ms        79.48%       3.027ms       1.009ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.027ms        79.48%       3.027ms       1.009ms             3  
+                                       aten::contiguous         0.16%       8.609us        31.88%       1.704ms     142.018us       0.000us         0.00%     841.280us      70.107us            12  
+                                            aten::clone         0.50%      26.820us        31.72%       1.696ms     141.301us       0.000us         0.00%     841.280us      70.107us            12  
+                                            aten::copy_         1.47%      78.703us        30.10%       1.609ms     134.076us     781.631us        20.52%     841.280us      70.107us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     781.631us        20.52%     781.631us      65.136us            12  
+                                Activity Buffer Request        27.11%       1.449ms        27.11%       1.449ms       1.449ms      59.649us         1.57%      59.649us      59.649us             1  
+                                        aten::transpose         0.90%      48.151us         1.22%      65.102us       2.713us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.32%      16.951us         0.32%      16.951us       0.706us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.35%      18.789us         1.49%      79.862us       5.324us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.38%      73.892us         1.38%      73.892us       3.079us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.96%     104.680us         1.96%     104.680us       6.979us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.28%      15.081us         0.28%      15.081us       5.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.791us         0.03%       1.791us       0.299us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.500us         0.07%       3.500us       1.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.30%       3.169ms        59.30%       3.169ms       3.169ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.288ms
-Self CUDA time total: 3.774ms
+Self CPU time total: 5.345ms
+Self CUDA time total: 3.808ms
 
 
 
@@ -4068,29 +4076,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.36%     241.837us        43.33%       2.405ms       2.405ms       0.000us         0.00%       3.884ms       3.884ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.837ms       100.27%       3.837ms       3.837ms             1  
-                     aten::scaled_dot_product_attention         0.48%      26.802us         3.27%     181.715us      60.572us       0.000us         0.00%       3.042ms       1.014ms             3  
-              aten::_scaled_dot_product_flash_attention         0.35%      19.308us         2.79%     154.913us      51.638us       0.000us         0.00%       3.042ms       1.014ms             3  
-                         aten::_flash_attention_forward         0.60%      33.361us         2.03%     112.712us      37.571us       3.042ms        79.50%       3.042ms       1.014ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.042ms        79.50%       3.042ms       1.014ms             3  
-                                       aten::contiguous         0.17%       9.659us        34.84%       1.934ms     161.162us       0.000us         0.00%     841.829us      70.152us            12  
-                                            aten::clone         0.50%      27.830us        34.67%       1.924ms     160.357us       0.000us         0.00%     841.829us      70.152us            12  
-                                            aten::copy_         1.56%      86.702us        32.55%       1.807ms     150.547us     784.548us        20.50%     841.829us      70.152us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     784.548us        20.50%     784.548us      65.379us            12  
-                                Activity Buffer Request        25.45%       1.413ms        25.45%       1.413ms       1.413ms      57.281us         1.50%      57.281us      57.281us             1  
-                                        aten::transpose         0.95%      52.620us         1.27%      70.404us       2.933us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.32%      17.784us         0.32%      17.784us       0.741us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.78%      43.221us         2.00%     111.194us       7.413us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.45%      80.673us         1.45%      80.673us       3.361us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         5.96%     331.078us         5.96%     331.078us      22.072us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.28%      15.800us         0.28%      15.800us       5.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.730us         0.03%       1.730us       0.288us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.850us         0.07%       3.850us       1.283us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        56.67%       3.146ms        56.67%       3.146ms       3.146ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.50%     255.237us        42.25%       2.398ms       2.398ms       0.000us         0.00%       3.984ms       3.984ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.936ms       100.28%       3.936ms       3.936ms             1  
+                     aten::scaled_dot_product_attention         0.42%      23.840us         3.17%     179.904us      59.968us       0.000us         0.00%       3.135ms       1.045ms             3  
+              aten::_scaled_dot_product_flash_attention         0.36%      20.442us         2.75%     156.064us      52.021us       0.000us         0.00%       3.135ms       1.045ms             3  
+                         aten::_flash_attention_forward         0.68%      38.721us         1.99%     113.183us      37.728us       3.135ms        79.87%       3.135ms       1.045ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.135ms        79.87%       3.135ms       1.045ms             3  
+                                       aten::contiguous         0.17%       9.382us        33.81%       1.919ms     159.915us       0.000us         0.00%     848.416us      70.701us            12  
+                                            aten::clone         0.52%      29.639us        33.64%       1.910ms     159.133us       0.000us         0.00%     848.416us      70.701us            12  
+                                            aten::copy_         1.40%      79.644us        32.03%       1.818ms     151.492us     790.048us        20.13%     848.416us      70.701us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     790.048us        20.13%     790.048us      65.837us            12  
+                                Activity Buffer Request        25.14%       1.427ms        25.14%       1.427ms       1.427ms      58.368us         1.49%      58.368us      58.368us             1  
+                                        aten::transpose         0.87%      49.289us         1.17%      66.169us       2.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.30%      16.880us         0.30%      16.880us       0.703us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.35%      19.852us         1.42%      80.662us       5.377us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.32%      74.981us         1.32%      74.981us       3.124us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.89%     334.125us         5.89%     334.125us      22.275us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.24%      13.720us         0.24%      13.720us       4.573us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.760us         0.03%       1.760us       0.293us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.570us         0.06%       3.570us       1.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        57.75%       3.278ms        57.75%       3.278ms       3.278ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.551ms
-Self CUDA time total: 3.827ms
+Self CPU time total: 5.676ms
+Self CUDA time total: 3.925ms
 
 
 
@@ -4100,29 +4108,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.46%     268.165us        40.09%       2.413ms       2.413ms       0.000us         0.00%       4.405ms       4.405ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.355ms       100.25%       4.355ms       4.355ms             1  
-                     aten::scaled_dot_product_attention         0.46%      27.642us         3.64%     218.806us      72.935us       0.000us         0.00%       3.540ms       1.180ms             3  
-              aten::_scaled_dot_product_flash_attention         0.75%      45.250us         3.18%     191.164us      63.721us       0.000us         0.00%       3.540ms       1.180ms             3  
-                         aten::_flash_attention_forward         0.61%      36.651us         2.01%     120.923us      40.308us       3.540ms        81.48%       3.540ms       1.180ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.540ms        81.48%       3.540ms       1.180ms             3  
-                                       aten::contiguous         0.18%      10.862us        31.11%       1.873ms     156.050us       0.000us         0.00%     865.606us      72.134us            12  
-                                            aten::clone         0.51%      30.490us        30.93%       1.862ms     155.145us       0.000us         0.00%     865.606us      72.134us            12  
-                                            aten::copy_         1.51%      90.931us        29.34%       1.766ms     147.155us     804.645us        18.52%     865.606us      72.134us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     804.645us        18.52%     804.645us      67.054us            12  
-                                Activity Buffer Request        21.61%       1.300ms        21.61%       1.300ms       1.300ms      60.961us         1.40%      60.961us      60.961us             1  
-                                        aten::transpose         0.99%      59.753us         1.30%      78.501us       3.271us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.31%      18.748us         0.31%      18.748us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.35%      20.935us         1.45%      87.165us       5.811us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.32%      79.690us         1.32%      79.690us       3.320us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         6.67%     401.680us         6.67%     401.680us      26.779us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.27%      16.081us         0.27%      16.081us       5.360us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       2.030us         0.03%       2.030us       0.338us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.06%       3.810us         0.06%       3.810us       1.270us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        59.91%       3.605ms        59.91%       3.605ms       3.605ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         5.07%     311.056us        40.82%       2.505ms       2.505ms       0.000us         0.00%       4.409ms       4.409ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.359ms       100.26%       4.359ms       4.359ms             1  
+                     aten::scaled_dot_product_attention         0.41%      24.931us         3.07%     188.265us      62.755us       0.000us         0.00%       3.539ms       1.180ms             3  
+              aten::_scaled_dot_product_flash_attention         0.33%      20.199us         2.66%     163.334us      54.445us       0.000us         0.00%       3.539ms       1.180ms             3  
+                         aten::_flash_attention_forward         0.67%      41.371us         1.94%     118.823us      39.608us       3.539ms        81.38%       3.539ms       1.180ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.539ms        81.38%       3.539ms       1.180ms             3  
+                                       aten::contiguous         0.16%       9.771us        31.97%       1.962ms     163.526us       0.000us         0.00%     870.819us      72.568us            12  
+                                            aten::clone         0.47%      28.779us        31.82%       1.953ms     162.712us       0.000us         0.00%     870.819us      72.568us            12  
+                                            aten::copy_         1.27%      77.896us        30.33%       1.862ms     155.132us     809.571us        18.62%     870.819us      72.568us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     809.571us        18.62%     809.571us      67.464us            12  
+                                Activity Buffer Request        24.14%       1.481ms        24.14%       1.481ms       1.481ms      61.248us         1.41%      61.248us      61.248us             1  
+                                        aten::transpose         0.82%      50.583us         1.11%      68.092us       2.837us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.29%      17.509us         0.29%      17.509us       0.730us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.32%      19.913us         1.33%      81.883us       5.459us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%      75.660us         1.23%      75.660us       3.153us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.31%     325.825us         5.31%     325.825us      21.722us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.24%      14.770us         0.24%      14.770us       4.923us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.990us         0.03%       1.990us       0.332us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.670us         0.06%       3.670us       1.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.18%       3.632ms        59.18%       3.632ms       3.632ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.018ms
-Self CUDA time total: 4.344ms
+Self CPU time total: 6.137ms
+Self CUDA time total: 4.348ms
 
 
 
@@ -4132,91 +4140,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.01%     246.839us        39.75%       2.447ms       2.447ms       0.000us         0.00%       4.458ms       4.458ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.407ms       100.23%       4.407ms       4.407ms             1  
-                     aten::scaled_dot_product_attention         0.40%      24.621us         2.95%     181.474us      60.491us       0.000us         0.00%       3.579ms       1.193ms             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      20.980us         2.55%     156.853us      52.284us       0.000us         0.00%       3.579ms       1.193ms             3  
-                         aten::_flash_attention_forward         0.58%      35.588us         1.84%     113.003us      37.668us       3.579ms        81.40%       3.579ms       1.193ms             3  
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.579ms        81.40%       3.579ms       1.193ms             3  
-                                       aten::contiguous         0.16%      10.061us        32.01%       1.971ms     164.244us       0.000us         0.00%     878.818us      73.235us            12  
-                                            aten::clone         0.50%      30.903us        31.85%       1.961ms     163.406us       0.000us         0.00%     878.818us      73.235us            12  
-                                            aten::copy_         1.35%      82.841us        30.27%       1.864ms     155.305us     817.634us        18.60%     878.818us      73.235us            12  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     817.634us        18.60%     817.634us      68.136us            12  
-                                Activity Buffer Request        23.50%       1.447ms        23.50%       1.447ms       1.447ms      61.184us         1.39%      61.184us      61.184us             1  
-                                        aten::transpose         0.85%      52.630us         1.15%      70.790us       2.950us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.29%      18.160us         0.29%      18.160us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.33%      20.456us         1.41%      86.700us       5.780us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.28%      78.794us         1.28%      78.794us       3.283us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         5.81%     357.919us         5.81%     357.919us      23.861us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.25%      15.401us         0.25%      15.401us       5.134us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.632us         0.03%       1.632us       0.272us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.06%       3.720us         0.06%       3.720us       1.240us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        60.25%       3.709ms        60.25%       3.709ms       3.709ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.13%     252.675us        38.98%       2.384ms       2.384ms       0.000us         0.00%       4.451ms       4.451ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.400ms       100.24%       4.400ms       4.400ms             1  
+                     aten::scaled_dot_product_attention         0.50%      30.480us         3.11%     190.334us      63.445us       0.000us         0.00%       3.566ms       1.189ms             3  
+              aten::_scaled_dot_product_flash_attention         0.31%      19.082us         2.61%     159.854us      53.285us       0.000us         0.00%       3.566ms       1.189ms             3  
+                         aten::_flash_attention_forward         0.62%      38.112us         1.93%     118.053us      39.351us       3.566ms        81.24%       3.566ms       1.189ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.566ms        81.24%       3.566ms       1.189ms             3  
+                                       aten::contiguous         0.16%       9.891us        31.02%       1.897ms     158.059us       0.000us         0.00%     884.831us      73.736us            12  
+                                            aten::clone         0.50%      30.290us        30.85%       1.887ms     157.234us       0.000us         0.00%     884.831us      73.736us            12  
+                                            aten::copy_         1.28%      78.520us        29.35%       1.795ms     149.550us     823.711us        18.76%     884.831us      73.736us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     823.711us        18.76%     823.711us      68.643us            12  
+                                Activity Buffer Request        23.29%       1.424ms        23.29%       1.424ms       1.424ms      61.120us         1.39%      61.120us      61.120us             1  
+                                        aten::transpose         0.81%      49.593us         1.09%      66.721us       2.780us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.28%      17.128us         0.28%      17.128us       0.714us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.33%      20.381us         1.35%      82.362us       5.491us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%      74.920us         1.23%      74.920us       3.122us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.19%     317.558us         5.19%     317.558us      21.171us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.25%      15.161us         0.25%      15.161us       5.054us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.791us         0.03%       1.791us       0.299us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.670us         0.06%       3.670us       1.223us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        61.02%       3.732ms        61.02%       3.732ms       3.732ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.156ms
-Self CUDA time total: 4.397ms
+Self CPU time total: 6.115ms
+Self CUDA time total: 4.390ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
-torch_flash_ma           cuda_attn_L256_bfloat16     1.28  True
-torch_flash_ma           cuda_attn_L320_bfloat16     1.29  True
-torch_flash_ma           cuda_attn_L384_bfloat16     1.33  True
-torch_flash_ma           cuda_attn_L448_bfloat16     1.47  True
-torch_flash_ma           cuda_attn_L512_bfloat16     1.50  True
+torch_flash_ma           cuda_attn_L128_bfloat16     1.21  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.30  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.32  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.48  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.49  True
 
-
-
▶ UV Install Logs
- -

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html index 7d03567858952d02de89e25ce04873ef34373a75..e3b14083fe3f4abc66afe6736903f326bb673642 100644 --- a/flash_attn/impls/hf_kernels_flash_attn.html +++ b/flash_attn/impls/hf_kernels_flash_attn.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 5.58s +Cell: benchmark | 5.46s | Raw @@ -3926,21 +3934,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 3.55% 156.153us 41.08% 1.807ms 1.807ms 0.000us 0.00% 3.775ms 3.775ms 1 - _flash_attn_9e27194::fwd 1.65% 72.542us 37.53% 1.651ms 550.240us 2.812ms 100.00% 3.775ms 1.258ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.814ms 100.05% 2.814ms 2.814ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.812ms 100.00% 2.812ms 937.398us 3 - Activity Buffer Request 32.22% 1.417ms 32.22% 1.417ms 1.417ms 962.880us 34.24% 962.880us 962.880us 1 - cudaDeviceGetAttribute 0.13% 5.500us 0.13% 5.500us 0.367us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.43% 19.110us 1.25% 54.882us 18.294us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.81% 35.772us 0.81% 35.772us 11.924us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.57% 25.101us 0.57% 25.101us 2.789us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.30% 13.270us 0.30% 13.270us 4.423us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.42% 62.402us 1.42% 62.402us 20.801us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.92% 2.591ms 58.92% 2.591ms 2.591ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 3.61% 157.413us 41.18% 1.795ms 1.795ms 0.000us 0.00% 3.726ms 3.726ms 1 + _flash_attn_9e27194::fwd 1.61% 70.165us 37.57% 1.638ms 545.853us 2.781ms 100.00% 3.726ms 1.242ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.783ms 100.05% 2.783ms 2.783ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.781ms 100.00% 2.781ms 927.059us 3 + Activity Buffer Request 32.93% 1.435ms 32.93% 1.435ms 1.435ms 944.349us 33.96% 944.349us 944.349us 1 + cudaDeviceGetAttribute 0.11% 4.789us 0.11% 4.789us 0.319us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.38% 16.590us 1.18% 51.251us 17.084us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.80% 34.661us 0.80% 34.661us 11.554us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.57% 24.950us 0.57% 24.950us 2.772us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.27% 11.579us 0.27% 11.579us 3.860us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.90% 39.431us 0.90% 39.431us 13.144us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.82% 2.564ms 58.82% 2.564ms 2.564ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.398ms -Self CUDA time total: 2.812ms +Self CPU time total: 4.359ms +Self CUDA time total: 2.781ms @@ -3950,21 +3958,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.04% 91.192us 36.62% 1.634ms 1.634ms 0.000us 0.00% 3.983ms 3.983ms 1 - _flash_attn_9e27194::fwd 1.11% 49.718us 34.57% 1.543ms 514.203us 2.978ms 100.00% 3.983ms 1.328ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.980ms 100.05% 2.980ms 2.980ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.978ms 100.00% 2.978ms 992.707us 3 - Activity Buffer Request 31.74% 1.416ms 31.74% 1.416ms 1.416ms 1.004ms 33.73% 1.004ms 1.004ms 1 - cudaDeviceGetAttribute 0.08% 3.711us 0.08% 3.711us 0.247us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.17% 7.481us 0.51% 22.841us 7.614us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.34% 15.360us 0.34% 15.360us 5.120us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.46% 20.620us 0.46% 20.620us 2.291us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.741us 0.08% 3.741us 1.247us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.58% 25.842us 0.58% 25.842us 8.614us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 63.38% 2.828ms 63.38% 2.828ms 2.828ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 1.92% 86.861us 37.15% 1.685ms 1.685ms 0.000us 0.00% 3.967ms 3.967ms 1 + _flash_attn_9e27194::fwd 1.05% 47.633us 35.24% 1.598ms 532.729us 2.988ms 100.00% 3.967ms 1.322ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.989ms 100.05% 2.989ms 2.989ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 995.953us 3 + Activity Buffer Request 32.54% 1.476ms 32.54% 1.476ms 1.476ms 979.196us 32.77% 979.196us 979.196us 1 + cudaDeviceGetAttribute 0.08% 3.549us 0.08% 3.549us 0.237us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 6.770us 0.48% 21.750us 7.250us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.33% 14.980us 0.33% 14.980us 4.993us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.45% 20.562us 0.45% 20.562us 2.285us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.410us 0.08% 3.410us 1.137us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.56% 25.521us 0.56% 25.521us 8.507us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.85% 2.850ms 62.85% 2.850ms 2.850ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.462ms -Self CUDA time total: 2.978ms +Self CPU time total: 4.535ms +Self CUDA time total: 2.988ms @@ -3974,21 +3982,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.28% 105.284us 36.17% 1.673ms 1.673ms 0.000us 0.00% 4.145ms 4.145ms 1 - _flash_attn_9e27194::fwd 1.09% 50.271us 33.89% 1.567ms 522.459us 3.096ms 100.00% 4.145ms 1.382ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.098ms 100.05% 3.098ms 3.098ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.096ms 100.00% 3.096ms 1.032ms 3 - Activity Buffer Request 31.08% 1.437ms 31.08% 1.437ms 1.437ms 1.049ms 33.87% 1.049ms 1.049ms 1 - cudaDeviceGetAttribute 0.08% 3.850us 0.08% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.15% 7.061us 0.49% 22.631us 7.544us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.34% 15.570us 0.34% 15.570us 5.190us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.47% 21.760us 0.47% 21.760us 2.418us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.689us 0.08% 3.689us 1.230us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.61% 27.992us 0.61% 27.992us 9.331us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 63.83% 2.952ms 63.83% 2.952ms 2.952ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.25% 102.643us 36.16% 1.652ms 1.652ms 0.000us 0.00% 4.081ms 4.081ms 1 + _flash_attn_9e27194::fwd 1.10% 50.081us 33.92% 1.550ms 516.605us 3.056ms 100.00% 4.081ms 1.360ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.058ms 100.05% 3.058ms 3.058ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.056ms 100.00% 3.056ms 1.019ms 3 + Activity Buffer Request 31.13% 1.423ms 31.13% 1.423ms 1.423ms 1.024ms 33.52% 1.024ms 1.024ms 1 + cudaDeviceGetAttribute 0.08% 3.832us 0.08% 3.832us 0.255us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 6.971us 0.48% 22.109us 7.370us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.33% 15.138us 0.33% 15.138us 5.046us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.46% 20.860us 0.46% 20.860us 2.318us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.430us 0.08% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.59% 26.891us 0.59% 26.891us 8.964us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.84% 2.917ms 63.84% 2.917ms 2.917ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.625ms -Self CUDA time total: 3.096ms +Self CPU time total: 4.569ms +Self CUDA time total: 3.056ms @@ -3998,21 +4006,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.30% 110.882us 38.29% 1.842ms 1.842ms 0.000us 0.00% 4.161ms 4.161ms 1 - _flash_attn_9e27194::fwd 1.05% 50.321us 35.98% 1.731ms 577.014us 3.117ms 100.00% 4.161ms 1.387ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.118ms 100.05% 3.118ms 3.118ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.117ms 100.00% 3.117ms 1.039ms 3 - Activity Buffer Request 29.64% 1.426ms 29.64% 1.426ms 1.426ms 1.044ms 33.50% 1.044ms 1.044ms 1 - cudaDeviceGetAttribute 0.08% 3.780us 0.08% 3.780us 0.252us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.15% 7.259us 0.50% 24.240us 8.080us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.35% 16.981us 0.35% 16.981us 5.660us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.45% 21.602us 0.45% 21.602us 2.400us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.770us 0.08% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.18% 201.205us 4.18% 201.205us 67.068us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 61.71% 2.969ms 61.71% 2.969ms 2.969ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.25% 106.084us 38.22% 1.803ms 1.803ms 0.000us 0.00% 4.091ms 4.091ms 1 + _flash_attn_9e27194::fwd 1.01% 47.791us 35.97% 1.697ms 565.799us 3.060ms 100.00% 4.091ms 1.364ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.062ms 100.06% 3.062ms 3.062ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.060ms 100.00% 3.060ms 1.020ms 3 + Activity Buffer Request 30.05% 1.418ms 30.05% 1.418ms 1.418ms 1.031ms 33.68% 1.031ms 1.031ms 1 + cudaDeviceGetAttribute 0.08% 3.720us 0.08% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.600us 0.52% 24.620us 8.207us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.36% 17.020us 0.36% 17.020us 5.673us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.44% 20.780us 0.44% 20.780us 2.309us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.620us 0.08% 3.620us 1.207us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.79% 178.824us 3.79% 178.824us 59.608us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 61.78% 2.916ms 61.78% 2.916ms 2.916ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.811ms -Self CUDA time total: 3.117ms +Self CPU time total: 4.719ms +Self CUDA time total: 3.060ms @@ -4022,21 +4030,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.05% 108.443us 34.64% 1.832ms 1.832ms 0.000us 0.00% 4.810ms 4.810ms 1 - _flash_attn_9e27194::fwd 0.96% 50.812us 32.59% 1.723ms 574.364us 3.602ms 100.00% 4.810ms 1.603ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.603ms 100.04% 3.603ms 3.603ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.602ms 100.00% 3.602ms 1.201ms 3 - Activity Buffer Request 27.53% 1.455ms 27.53% 1.455ms 1.455ms 1.209ms 33.55% 1.209ms 1.209ms 1 - cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.14% 7.390us 0.45% 23.900us 7.967us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.31% 16.510us 0.31% 16.510us 5.503us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.40% 21.151us 0.40% 21.151us 2.350us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.10% 164.023us 3.10% 164.023us 54.674us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 65.36% 3.455ms 65.36% 3.455ms 3.455ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.06% 106.072us 34.88% 1.800ms 1.800ms 0.000us 0.00% 4.679ms 4.679ms 1 + _flash_attn_9e27194::fwd 0.97% 50.192us 32.82% 1.694ms 564.573us 3.505ms 100.00% 4.679ms 1.560ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.507ms 100.04% 3.507ms 3.507ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.505ms 100.00% 3.505ms 1.168ms 3 + Activity Buffer Request 27.53% 1.421ms 27.53% 1.421ms 1.421ms 1.174ms 33.50% 1.174ms 1.174ms 1 + cudaDeviceGetAttribute 0.08% 4.219us 0.08% 4.219us 0.281us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 7.700us 0.46% 23.940us 7.980us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.31% 16.240us 0.31% 16.240us 5.413us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.41% 21.049us 0.41% 21.049us 2.339us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.601us 0.07% 3.601us 1.200us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.29% 169.975us 3.29% 169.975us 56.658us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 65.12% 3.360ms 65.12% 3.360ms 3.360ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.287ms -Self CUDA time total: 3.602ms +Self CPU time total: 5.160ms +Self CUDA time total: 3.505ms @@ -4046,35 +4054,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 1.95% 105.103us 34.11% 1.836ms 1.836ms 0.000us 0.00% 4.931ms 4.931ms 1 - _flash_attn_9e27194::fwd 1.08% 58.141us 32.16% 1.731ms 577.087us 3.693ms 100.00% 4.931ms 1.644ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.695ms 100.04% 3.695ms 3.695ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.693ms 100.00% 3.693ms 1.231ms 3 - Activity Buffer Request 26.71% 1.438ms 26.71% 1.438ms 1.438ms 1.238ms 33.53% 1.238ms 1.238ms 1 - cudaDeviceGetAttribute 0.08% 4.380us 0.08% 4.380us 0.292us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.15% 8.230us 0.50% 26.750us 8.917us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.34% 18.520us 0.34% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.48% 25.961us 0.48% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 4.220us 0.08% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.23% 173.714us 3.23% 173.714us 57.905us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 65.89% 3.548ms 65.89% 3.548ms 3.548ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.05% 108.192us 34.34% 1.815ms 1.815ms 0.000us 0.00% 4.838ms 4.838ms 1 + _flash_attn_9e27194::fwd 0.96% 50.903us 32.30% 1.707ms 568.907us 3.618ms 100.00% 4.838ms 1.613ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.620ms 100.04% 3.620ms 3.620ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.618ms 100.00% 3.618ms 1.206ms 3 + Activity Buffer Request 26.73% 1.413ms 26.73% 1.413ms 1.413ms 1.220ms 33.72% 1.220ms 1.220ms 1 + cudaDeviceGetAttribute 0.07% 3.869us 0.07% 3.869us 0.258us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.14% 7.319us 0.48% 25.360us 8.453us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.34% 18.041us 0.34% 18.041us 6.014us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.41% 21.680us 0.41% 21.680us 2.409us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.810us 0.07% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.57% 188.496us 3.57% 188.496us 62.832us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 65.66% 3.470ms 65.66% 3.470ms 3.470ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.384ms -Self CUDA time total: 3.693ms +Self CPU time total: 5.285ms +Self CUDA time total: 3.618ms impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True -hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True -hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True -hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True +hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True +hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True +hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.19 True
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s] -Fetching 20 files: 10%|█ | 2/20 [00:01<00:13, 1.34it/s] -Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 13.40it/s] +Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 6.07it/s] +Fetching 20 files: 10%|█ | 2/20 [00:01<00:12, 1.40it/s] +Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 15.82it/s]

Artifacts:

diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html index 889bda3eb9ecfa28e1bd79f67d85d1acc88d58a0..91636a92fe37a07aae11ba5da111f83413352088 100644 --- a/flash_attn/impls/hf_kernels_flash_attn3.html +++ b/flash_attn/impls/hf_kernels_flash_attn3.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3869,9 +3877,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 5.52s +Cell: benchmark | 5.78s | Raw @@ -3925,19 +3933,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 3.72% 161.222us 44.67% 1.935ms 1.935ms 0.000us 0.00% 3.599ms 3.599ms 1 - FlashAttnFunc 2.81% 121.834us 40.95% 1.774ms 591.218us 0.000us 0.00% 3.599ms 1.200ms 3 - _flash_attn3_48fe103_dirty::fwd 1.85% 79.992us 38.14% 1.652ms 550.607us 2.693ms 100.00% 3.599ms 1.200ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.695ms 100.05% 2.695ms 2.695ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.693ms 100.00% 2.693ms 897.759us 3 - Activity Buffer Request 33.93% 1.470ms 33.93% 1.470ms 1.470ms 905.439us 33.62% 905.439us 905.439us 1 - aten::empty 1.00% 43.311us 1.00% 43.311us 7.219us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.32% 13.891us 0.32% 13.891us 4.630us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.04% 45.121us 1.04% 45.121us 15.040us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 55.33% 2.396ms 55.33% 2.396ms 2.396ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 3.71% 164.893us 43.76% 1.944ms 1.944ms 0.000us 0.00% 3.688ms 3.688ms 1 + FlashAttnFunc 2.67% 118.403us 40.05% 1.779ms 593.141us 0.000us 0.00% 3.688ms 1.229ms 3 + _flash_attn3_48fe103_dirty::fwd 1.75% 77.922us 37.39% 1.661ms 553.673us 2.790ms 100.00% 3.688ms 1.229ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.791ms 100.05% 2.791ms 2.791ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.790ms 100.00% 2.790ms 929.856us 3 + Activity Buffer Request 33.30% 1.480ms 33.30% 1.480ms 1.480ms 898.016us 32.19% 898.016us 898.016us 1 + aten::empty 1.01% 44.942us 1.01% 44.942us 7.490us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.31% 13.870us 0.31% 13.870us 4.623us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.01% 44.741us 1.01% 44.741us 14.914us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 56.24% 2.499ms 56.24% 2.499ms 2.499ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.331ms -Self CUDA time total: 2.693ms +Self CPU time total: 4.443ms +Self CUDA time total: 2.790ms @@ -3947,19 +3955,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.17% 96.772us 39.76% 1.770ms 1.770ms 0.000us 0.00% 3.876ms 3.876ms 1 - FlashAttnFunc 2.04% 90.694us 37.59% 1.674ms 557.834us 0.000us 0.00% 3.876ms 1.292ms 3 - _flash_attn3_48fe103_dirty::fwd 1.15% 51.142us 35.55% 1.583ms 527.603us 2.896ms 100.00% 3.876ms 1.292ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.898ms 100.05% 2.898ms 2.898ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.896ms 100.00% 2.896ms 965.387us 3 - Activity Buffer Request 33.04% 1.471ms 33.04% 1.471ms 1.471ms 979.809us 33.83% 979.809us 979.809us 1 - aten::empty 0.58% 25.610us 0.58% 25.610us 4.268us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.240us 0.12% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.67% 29.750us 0.67% 29.750us 9.917us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 60.24% 2.682ms 60.24% 2.682ms 2.682ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.31% 100.671us 40.75% 1.773ms 1.773ms 0.000us 0.00% 3.735ms 3.735ms 1 + FlashAttnFunc 2.09% 91.144us 38.44% 1.673ms 557.547us 0.000us 0.00% 3.735ms 1.245ms 3 + _flash_attn3_48fe103_dirty::fwd 1.16% 50.371us 36.34% 1.581ms 527.165us 2.796ms 100.00% 3.735ms 1.245ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.798ms 100.06% 2.798ms 2.798ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.00% 2.796ms 932.000us 3 + Activity Buffer Request 33.75% 1.469ms 33.75% 1.469ms 1.469ms 939.487us 33.60% 939.487us 939.487us 1 + aten::empty 0.64% 27.720us 0.64% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 4.991us 0.11% 4.991us 1.664us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.68% 29.510us 0.68% 29.510us 9.837us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 59.25% 2.578ms 59.25% 2.578ms 2.578ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.452ms -Self CUDA time total: 2.896ms +Self CPU time total: 4.352ms +Self CUDA time total: 2.796ms @@ -3969,19 +3977,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.19% 98.331us 39.82% 1.786ms 1.786ms 0.000us 0.00% 3.885ms 3.885ms 1 - FlashAttnFunc 1.99% 89.333us 37.63% 1.688ms 562.551us 0.000us 0.00% 3.885ms 1.295ms 3 - _flash_attn3_48fe103_dirty::fwd 1.08% 48.311us 35.64% 1.598ms 532.773us 2.912ms 100.00% 3.885ms 1.295ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.05% 2.914ms 2.914ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912ms 100.00% 2.912ms 970.802us 3 - Activity Buffer Request 33.18% 1.488ms 33.18% 1.488ms 1.488ms 972.637us 33.40% 972.637us 972.637us 1 - aten::empty 0.57% 25.370us 0.57% 25.370us 4.228us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.13% 5.730us 0.13% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.69% 30.861us 0.69% 30.861us 10.287us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 60.18% 2.699ms 60.18% 2.699ms 2.699ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.10% 95.451us 39.98% 1.817ms 1.817ms 0.000us 0.00% 3.967ms 3.967ms 1 + FlashAttnFunc 2.52% 114.605us 37.88% 1.721ms 573.824us 0.000us 0.00% 3.967ms 1.322ms 3 + _flash_attn3_48fe103_dirty::fwd 1.12% 50.981us 35.36% 1.607ms 535.622us 2.964ms 100.00% 3.967ms 1.322ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.966ms 100.05% 2.966ms 2.966ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.00% 2.964ms 988.118us 3 + Activity Buffer Request 32.83% 1.492ms 32.83% 1.492ms 1.492ms 1.002ms 33.81% 1.002ms 1.002ms 1 + aten::empty 0.60% 27.089us 0.60% 27.089us 4.515us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.480us 0.12% 5.480us 1.827us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.69% 31.551us 0.69% 31.551us 10.517us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.02% 2.727ms 60.02% 2.727ms 2.727ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.485ms -Self CUDA time total: 2.912ms +Self CPU time total: 4.544ms +Self CUDA time total: 2.964ms @@ -3991,19 +3999,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.51% 118.553us 41.81% 1.973ms 1.973ms 0.000us 0.00% 3.964ms 3.964ms 1 - FlashAttnFunc 1.94% 91.662us 39.30% 1.855ms 618.205us 0.000us 0.00% 3.964ms 1.321ms 3 - _flash_attn3_48fe103_dirty::fwd 1.07% 50.373us 37.36% 1.763ms 587.651us 2.962ms 100.00% 3.964ms 1.321ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.05% 2.964ms 2.964ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.962ms 100.00% 2.962ms 987.401us 3 - Activity Buffer Request 30.92% 1.459ms 30.92% 1.459ms 1.459ms 1.002ms 33.82% 1.002ms 1.002ms 1 - aten::empty 0.56% 26.451us 0.56% 26.451us 4.408us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.70% 221.845us 4.70% 221.845us 73.948us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.19% 2.746ms 58.19% 2.746ms 2.746ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.35% 113.792us 41.57% 2.016ms 2.016ms 0.000us 0.00% 4.078ms 4.078ms 1 + FlashAttnFunc 1.91% 92.684us 39.22% 1.902ms 634.112us 0.000us 0.00% 4.078ms 1.359ms 3 + _flash_attn3_48fe103_dirty::fwd 0.98% 47.600us 37.31% 1.810ms 603.217us 3.050ms 100.00% 4.078ms 1.359ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.052ms 100.05% 3.052ms 3.052ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.050ms 100.00% 3.050ms 1.017ms 3 + Activity Buffer Request 30.19% 1.464ms 30.19% 1.464ms 1.464ms 1.028ms 33.70% 1.028ms 1.028ms 1 + aten::empty 0.58% 28.221us 0.58% 28.221us 4.703us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.430us 0.11% 5.430us 1.810us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.44% 264.046us 5.44% 264.046us 88.015us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.43% 2.834ms 58.43% 2.834ms 2.834ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.719ms -Self CUDA time total: 2.962ms +Self CPU time total: 4.851ms +Self CUDA time total: 3.050ms @@ -4013,19 +4021,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.19% 114.453us 37.34% 1.953ms 1.953ms 0.000us 0.00% 4.662ms 4.662ms 1 - FlashAttnFunc 1.73% 90.401us 35.15% 1.838ms 612.822us 0.000us 0.00% 4.662ms 1.554ms 3 - _flash_attn3_48fe103_dirty::fwd 0.97% 50.643us 33.42% 1.748ms 582.688us 3.490ms 100.00% 4.662ms 1.554ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.492ms 100.04% 3.492ms 3.492ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.490ms 100.00% 3.490ms 1.163ms 3 - Activity Buffer Request 28.44% 1.487ms 28.44% 1.487ms 1.487ms 1.171ms 33.56% 1.171ms 1.171ms 1 - aten::empty 0.52% 27.271us 0.52% 27.271us 4.545us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.09% 4.950us 0.09% 4.950us 1.650us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.40% 178.024us 3.40% 178.024us 59.341us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 62.66% 3.277ms 62.66% 3.277ms 3.277ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.29% 116.152us 37.60% 1.908ms 1.908ms 0.000us 0.00% 4.514ms 4.514ms 1 + FlashAttnFunc 1.78% 90.384us 35.31% 1.792ms 597.414us 0.000us 0.00% 4.514ms 1.505ms 3 + _flash_attn3_48fe103_dirty::fwd 0.91% 46.231us 33.53% 1.702ms 567.286us 3.379ms 100.00% 4.514ms 1.505ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.380ms 100.05% 3.380ms 3.380ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.379ms 100.00% 3.379ms 1.126ms 3 + Activity Buffer Request 28.41% 1.442ms 28.41% 1.442ms 1.442ms 1.136ms 33.61% 1.136ms 1.136ms 1 + aten::empty 0.54% 27.250us 0.54% 27.250us 4.542us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.250us 0.10% 5.250us 1.750us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.57% 181.204us 3.57% 181.204us 60.401us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.40% 3.167ms 62.40% 3.167ms 3.167ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.230ms -Self CUDA time total: 3.490ms +Self CPU time total: 5.075ms +Self CUDA time total: 3.379ms @@ -4035,34 +4043,38 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.26% 115.663us 36.27% 1.854ms 1.854ms 0.000us 0.00% 4.679ms 4.679ms 1 - FlashAttnFunc 2.25% 114.773us 34.01% 1.738ms 579.364us 0.000us 0.00% 4.679ms 1.560ms 3 - _flash_attn3_48fe103_dirty::fwd 1.02% 51.933us 31.76% 1.623ms 541.107us 3.499ms 100.00% 4.679ms 1.560ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 100.04% 3.500ms 3.500ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.499ms 100.00% 3.499ms 1.166ms 3 - Activity Buffer Request 26.80% 1.370ms 26.80% 1.370ms 1.370ms 1.181ms 33.75% 1.181ms 1.181ms 1 - aten::empty 0.54% 27.681us 0.54% 27.681us 4.613us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.10% 5.079us 0.10% 5.079us 1.693us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.30% 168.813us 3.30% 168.813us 56.271us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 63.73% 3.257ms 63.73% 3.257ms 3.257ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.24% 115.243us 39.36% 2.021ms 2.021ms 0.000us 0.00% 4.438ms 4.438ms 1 + FlashAttnFunc 1.78% 91.262us 37.12% 1.906ms 635.278us 0.000us 0.00% 4.438ms 1.479ms 3 + _flash_attn3_48fe103_dirty::fwd 0.90% 46.212us 35.34% 1.815ms 604.857us 3.325ms 100.00% 4.438ms 1.479ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.327ms 100.04% 3.327ms 3.327ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.325ms 100.00% 3.325ms 1.108ms 3 + Activity Buffer Request 30.40% 1.561ms 30.40% 1.561ms 1.561ms 1.113ms 33.46% 1.113ms 1.113ms 1 + aten::empty 0.54% 27.780us 0.54% 27.780us 4.630us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.330us 0.10% 5.330us 1.777us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.40% 174.454us 3.40% 174.454us 58.151us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.64% 3.113ms 60.64% 3.113ms 3.113ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.111ms -Self CUDA time total: 3.499ms +Self CPU time total: 5.134ms +Self CUDA time total: 3.325ms impl wl p50(ms) ok -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True -hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True -hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True -hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 1.00 True +hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.99 True +hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True +hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.15 True
-
-Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.38it/s] -Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.75it/s] +
+
▶ UV Install Logs
+
+
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] +Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.44it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.88it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html index f6ab4e24cf377304db7fbbedb7a4571918177b17..0f163f93535fa6d107f3aa43300a941584b7b578 100644 --- a/flash_attn/impls/mem_efficient_attention.html +++ b/flash_attn/impls/mem_efficient_attention.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 3.92s +Cell: benchmark | 3.89s | Raw @@ -3924,28 +3932,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 4.77% 333.269us 32.71% 2.284ms 2.284ms 0.000us 0.00% 5.420ms 5.420ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.402ms 100.61% 5.402ms 5.402ms 1 - aten::scaled_dot_product_attention 0.44% 30.450us 2.54% 177.435us 59.145us 0.000us 0.00% 4.753ms 1.584ms 3 - aten::_scaled_dot_product_efficient_attention 0.33% 22.722us 2.10% 146.985us 48.995us 0.000us 0.00% 4.753ms 1.584ms 3 - aten::_efficient_attention_forward 0.51% 35.382us 1.42% 99.273us 33.091us 4.753ms 88.51% 4.753ms 1.584ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.753ms 88.51% 4.753ms 1.584ms 3 - aten::contiguous 0.17% 11.660us 24.51% 1.712ms 190.185us 0.000us 0.00% 667.266us 74.141us 9 - aten::clone 0.46% 31.810us 24.34% 1.700ms 188.889us 0.000us 0.00% 667.266us 74.141us 9 - aten::copy_ 1.01% 70.871us 22.86% 1.597ms 177.404us 616.738us 11.49% 667.266us 74.141us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.738us 11.49% 616.738us 68.526us 9 - Activity Buffer Request 20.64% 1.441ms 20.64% 1.441ms 1.441ms 50.528us 0.94% 50.528us 50.528us 1 - aten::transpose 0.91% 63.619us 1.25% 87.011us 3.625us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.33% 23.392us 0.33% 23.392us 0.975us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.24% 16.972us 1.02% 71.553us 7.950us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 1.18% 82.691us 1.18% 82.691us 3.938us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.55% 108.383us 1.55% 108.383us 9.032us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.05% 3.260us 0.05% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.12% 8.450us 0.12% 8.450us 2.817us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 67.29% 4.700ms 67.29% 4.700ms 4.700ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 5.04% 355.427us 33.26% 2.347ms 2.347ms 0.000us 0.00% 5.443ms 5.443ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.441ms 100.90% 5.441ms 5.441ms 1 + aten::scaled_dot_product_attention 0.45% 31.972us 2.63% 185.885us 61.962us 0.000us 0.00% 4.772ms 1.591ms 3 + aten::_scaled_dot_product_efficient_attention 0.35% 24.621us 2.18% 153.913us 51.304us 0.000us 0.00% 4.772ms 1.591ms 3 + aten::_efficient_attention_forward 0.53% 37.509us 1.49% 105.321us 35.107us 4.772ms 88.48% 4.772ms 1.591ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.772ms 88.48% 4.772ms 1.591ms 3 + aten::contiguous 0.16% 11.612us 24.73% 1.745ms 193.873us 0.000us 0.00% 671.455us 74.606us 9 + aten::clone 0.45% 31.980us 24.56% 1.733ms 192.583us 0.000us 0.00% 671.455us 74.606us 9 + aten::copy_ 1.09% 76.971us 23.11% 1.631ms 181.191us 621.119us 11.52% 671.455us 74.606us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 621.119us 11.52% 621.119us 69.013us 9 + Activity Buffer Request 20.82% 1.469ms 20.82% 1.469ms 1.469ms 50.336us 0.93% 50.336us 50.336us 1 + aten::transpose 0.89% 62.923us 1.20% 84.503us 3.521us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.31% 21.580us 0.31% 21.580us 0.899us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.23% 16.040us 1.00% 70.551us 7.839us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 1.20% 84.702us 1.20% 84.702us 4.033us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.56% 109.883us 1.56% 109.883us 9.157us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.13% 9.350us 0.13% 9.350us 3.117us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 66.74% 4.709ms 66.74% 4.709ms 4.709ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.984ms -Self CUDA time total: 5.369ms +Self CPU time total: 7.056ms +Self CUDA time total: 5.393ms @@ -3955,28 +3963,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.53% 251.015us 29.52% 2.098ms 2.098ms 0.000us 0.00% 5.633ms 5.633ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.587ms 100.15% 5.587ms 5.587ms 1 - aten::scaled_dot_product_attention 0.25% 17.630us 2.05% 145.594us 48.531us 0.000us 0.00% 4.943ms 1.648ms 3 - aten::_scaled_dot_product_efficient_attention 0.28% 19.810us 1.80% 127.964us 42.655us 0.000us 0.00% 4.943ms 1.648ms 3 - aten::_efficient_attention_forward 0.42% 29.862us 1.18% 83.512us 27.837us 4.943ms 88.61% 4.943ms 1.648ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.943ms 88.61% 4.943ms 1.648ms 3 - aten::contiguous 0.10% 7.191us 23.30% 1.656ms 184.002us 0.000us 0.00% 689.540us 76.616us 9 - aten::clone 0.33% 23.318us 23.20% 1.649ms 183.203us 0.000us 0.00% 689.540us 76.616us 9 - aten::copy_ 0.92% 65.725us 22.12% 1.572ms 174.717us 635.140us 11.39% 689.540us 76.616us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.140us 11.39% 635.140us 70.571us 9 - Activity Buffer Request 20.24% 1.439ms 20.24% 1.439ms 1.439ms 54.400us 0.98% 54.400us 54.400us 1 - aten::transpose 0.71% 50.494us 0.99% 70.123us 2.922us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.28% 19.629us 0.28% 19.629us 0.818us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.18% 12.608us 0.75% 53.061us 5.896us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.94% 66.903us 0.94% 66.903us 3.186us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.25% 89.012us 1.25% 89.012us 7.418us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.220us 0.03% 2.220us 0.740us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.05% 3.880us 0.05% 3.880us 1.293us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 70.48% 5.009ms 70.48% 5.009ms 5.009ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.16% 230.972us 28.28% 2.069ms 2.069ms 0.000us 0.00% 5.837ms 5.837ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.791ms 100.14% 5.791ms 5.791ms 1 + aten::scaled_dot_product_attention 0.28% 20.721us 1.89% 138.014us 46.005us 0.000us 0.00% 5.147ms 1.716ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 18.299us 1.60% 117.293us 39.098us 0.000us 0.00% 5.147ms 1.716ms 3 + aten::_efficient_attention_forward 0.37% 27.244us 1.07% 78.053us 26.018us 5.147ms 89.00% 5.147ms 1.716ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.147ms 89.00% 5.147ms 1.716ms 3 + aten::contiguous 0.10% 7.473us 22.69% 1.660ms 184.464us 0.000us 0.00% 690.528us 76.725us 9 + aten::clone 0.31% 22.407us 22.59% 1.653ms 183.634us 0.000us 0.00% 690.528us 76.725us 9 + aten::copy_ 0.90% 65.683us 21.62% 1.582ms 175.735us 636.032us 11.00% 690.528us 76.725us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.032us 11.00% 636.032us 70.670us 9 + Activity Buffer Request 19.82% 1.450ms 19.82% 1.450ms 1.450ms 54.496us 0.94% 54.496us 54.496us 1 + aten::transpose 0.62% 45.174us 0.83% 60.723us 2.530us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.21% 15.549us 0.21% 15.549us 0.648us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.16% 11.973us 0.67% 48.683us 5.409us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.84% 61.270us 0.84% 61.270us 2.918us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.18% 86.180us 1.18% 86.180us 7.182us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.460us 0.03% 2.460us 0.820us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.159us 0.04% 3.159us 1.053us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.72% 5.248ms 71.72% 5.248ms 5.248ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.107ms -Self CUDA time total: 5.578ms +Self CPU time total: 7.317ms +Self CUDA time total: 5.783ms @@ -3986,28 +3994,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.28% 246.598us 28.54% 2.146ms 2.146ms 0.000us 0.00% 6.014ms 6.014ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.967ms 100.18% 5.967ms 5.967ms 1 - aten::scaled_dot_product_attention 0.24% 18.181us 1.92% 144.583us 48.194us 0.000us 0.00% 5.302ms 1.767ms 3 - aten::_scaled_dot_product_efficient_attention 0.27% 19.980us 1.68% 126.402us 42.134us 0.000us 0.00% 5.302ms 1.767ms 3 - aten::_efficient_attention_forward 0.38% 28.571us 1.10% 82.521us 27.507us 5.302ms 89.01% 5.302ms 1.767ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.302ms 89.01% 5.302ms 1.767ms 3 - aten::contiguous 0.09% 6.930us 22.70% 1.707ms 189.666us 0.000us 0.00% 712.547us 79.172us 9 - aten::clone 0.30% 22.691us 22.61% 1.700ms 188.896us 0.000us 0.00% 712.547us 79.172us 9 - aten::copy_ 1.08% 81.024us 21.57% 1.622ms 180.228us 654.403us 10.99% 712.547us 79.172us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.403us 10.99% 654.403us 72.711us 9 - Activity Buffer Request 19.57% 1.471ms 19.57% 1.471ms 1.471ms 58.144us 0.98% 58.144us 58.144us 1 - aten::transpose 0.68% 51.431us 0.95% 71.351us 2.973us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.26% 19.920us 0.26% 19.920us 0.830us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.16% 11.979us 0.74% 55.320us 6.147us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.93% 69.561us 0.93% 69.561us 3.312us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.22% 91.652us 1.22% 91.652us 7.638us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.359us 0.03% 2.359us 0.786us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.05% 3.430us 0.05% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 71.46% 5.373ms 71.46% 5.373ms 5.373ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.27% 244.917us 27.45% 2.054ms 2.054ms 0.000us 0.00% 6.034ms 6.034ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.984ms 100.14% 5.984ms 5.984ms 1 + aten::scaled_dot_product_attention 0.26% 19.270us 1.91% 142.603us 47.534us 0.000us 0.00% 5.315ms 1.772ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 18.622us 1.65% 123.333us 41.111us 0.000us 0.00% 5.315ms 1.772ms 3 + aten::_efficient_attention_forward 0.37% 27.710us 1.08% 80.560us 26.853us 5.315ms 88.95% 5.315ms 1.772ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.315ms 88.95% 5.315ms 1.772ms 3 + aten::contiguous 0.10% 7.220us 21.76% 1.628ms 180.911us 0.000us 0.00% 718.878us 79.875us 9 + aten::clone 0.29% 21.638us 21.66% 1.621ms 180.109us 0.000us 0.00% 718.878us 79.875us 9 + aten::copy_ 0.91% 68.381us 20.73% 1.551ms 172.378us 660.254us 11.05% 718.878us 79.875us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 660.254us 11.05% 660.254us 73.362us 9 + Activity Buffer Request 18.95% 1.418ms 18.95% 1.418ms 1.418ms 58.624us 0.98% 58.624us 58.624us 1 + aten::transpose 0.63% 46.916us 0.84% 62.771us 2.615us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.21% 15.855us 0.21% 15.855us 0.661us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 11.482us 0.64% 47.942us 5.327us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.82% 61.110us 0.82% 61.110us 2.910us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.17% 87.854us 1.17% 87.854us 7.321us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.410us 0.03% 2.410us 0.803us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 2.950us 0.04% 2.950us 0.983us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 72.55% 5.429ms 72.55% 5.429ms 5.429ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.519ms -Self CUDA time total: 5.956ms +Self CPU time total: 7.483ms +Self CUDA time total: 5.976ms @@ -4017,28 +4025,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.21% 251.576us 29.97% 2.347ms 2.347ms 0.000us 0.00% 6.116ms 6.116ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.068ms 100.14% 6.068ms 6.068ms 1 - aten::scaled_dot_product_attention 0.24% 18.800us 1.87% 146.693us 48.898us 0.000us 0.00% 5.408ms 1.803ms 3 - aten::_scaled_dot_product_efficient_attention 0.25% 19.900us 1.63% 127.893us 42.631us 0.000us 0.00% 5.408ms 1.803ms 3 - aten::_efficient_attention_forward 0.38% 29.372us 1.07% 83.903us 27.968us 5.408ms 89.25% 5.408ms 1.803ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.408ms 89.25% 5.408ms 1.803ms 3 - aten::contiguous 0.10% 7.511us 24.29% 1.902ms 211.340us 0.000us 0.00% 708.735us 78.748us 9 - aten::clone 0.28% 21.872us 24.19% 1.895ms 210.505us 0.000us 0.00% 708.735us 78.748us 9 - aten::copy_ 0.85% 66.540us 23.20% 1.817ms 201.834us 651.551us 10.75% 708.735us 78.748us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 651.551us 10.75% 651.551us 72.395us 9 - Activity Buffer Request 18.68% 1.462ms 18.68% 1.462ms 1.462ms 57.184us 0.94% 57.184us 57.184us 1 - aten::transpose 0.65% 50.781us 0.90% 70.402us 2.933us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.25% 19.621us 0.25% 19.621us 0.818us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 11.809us 0.72% 56.170us 6.241us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.90% 70.242us 0.90% 70.242us 3.345us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 3.97% 310.797us 3.97% 310.797us 25.900us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.220us 0.04% 3.220us 1.073us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 70.03% 5.484ms 70.03% 5.484ms 5.484ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.13% 245.154us 29.09% 2.280ms 2.280ms 0.000us 0.00% 6.166ms 6.166ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.15% 6.117ms 6.117ms 1 + aten::scaled_dot_product_attention 0.24% 18.991us 1.80% 140.753us 46.918us 0.000us 0.00% 5.454ms 1.818ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 19.741us 1.55% 121.762us 40.587us 0.000us 0.00% 5.454ms 1.818ms 3 + aten::_efficient_attention_forward 0.36% 27.980us 1.01% 79.030us 26.343us 5.454ms 89.29% 5.454ms 1.818ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.454ms 89.29% 5.454ms 1.818ms 3 + aten::contiguous 0.10% 7.853us 23.65% 1.854ms 206.016us 0.000us 0.00% 711.999us 79.111us 9 + aten::clone 0.28% 21.760us 23.55% 1.846ms 205.144us 0.000us 0.00% 711.999us 79.111us 9 + aten::copy_ 0.86% 67.621us 22.63% 1.774ms 197.124us 654.399us 10.71% 711.999us 79.111us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.399us 10.71% 654.399us 72.711us 9 + Activity Buffer Request 18.63% 1.461ms 18.63% 1.461ms 1.461ms 57.600us 0.94% 57.600us 57.600us 1 + aten::transpose 0.60% 47.388us 0.81% 63.381us 2.641us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.20% 15.993us 0.20% 15.993us 0.666us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 12.039us 0.64% 50.420us 5.602us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.81% 63.411us 0.81% 63.411us 3.020us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.40% 266.437us 3.40% 266.437us 22.203us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.470us 0.03% 2.470us 0.823us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.000us 0.04% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 70.91% 5.560ms 70.91% 5.560ms 5.560ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.830ms -Self CUDA time total: 6.059ms +Self CPU time total: 7.840ms +Self CUDA time total: 6.108ms @@ -4048,28 +4056,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.15% 250.575us 28.50% 2.270ms 2.270ms 0.000us 0.00% 6.322ms 6.322ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.270ms 100.14% 6.270ms 6.270ms 1 - aten::scaled_dot_product_attention 0.22% 17.572us 1.82% 145.084us 48.361us 0.000us 0.00% 5.598ms 1.866ms 3 - aten::_scaled_dot_product_efficient_attention 0.24% 19.250us 1.60% 127.512us 42.504us 0.000us 0.00% 5.598ms 1.866ms 3 - aten::_efficient_attention_forward 0.36% 28.812us 1.05% 83.962us 27.987us 5.598ms 89.40% 5.598ms 1.866ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.598ms 89.40% 5.598ms 1.866ms 3 - aten::contiguous 0.09% 6.912us 22.94% 1.827ms 203.045us 0.000us 0.00% 724.000us 80.444us 9 - aten::clone 0.28% 21.949us 22.86% 1.820ms 202.277us 0.000us 0.00% 724.000us 80.444us 9 - aten::copy_ 0.82% 65.091us 21.89% 1.744ms 193.745us 664.032us 10.60% 724.000us 80.444us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.032us 10.60% 664.032us 73.781us 9 - Activity Buffer Request 18.02% 1.435ms 18.02% 1.435ms 1.435ms 59.968us 0.96% 59.968us 59.968us 1 - aten::transpose 0.64% 50.930us 0.89% 70.859us 2.952us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.25% 19.929us 0.25% 19.929us 0.830us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 12.022us 0.69% 54.843us 6.094us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.87% 69.430us 0.87% 69.430us 3.306us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 3.34% 266.388us 3.34% 266.388us 22.199us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.320us 0.03% 2.320us 0.773us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.120us 0.04% 3.120us 1.040us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 71.50% 5.695ms 71.50% 5.695ms 5.695ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.12% 251.727us 28.35% 2.287ms 2.287ms 0.000us 0.00% 6.402ms 6.402ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.350ms 100.14% 6.350ms 6.350ms 1 + aten::scaled_dot_product_attention 0.24% 19.272us 1.78% 143.434us 47.811us 0.000us 0.00% 5.676ms 1.892ms 3 + aten::_scaled_dot_product_efficient_attention 0.24% 19.071us 1.54% 124.162us 41.387us 0.000us 0.00% 5.676ms 1.892ms 3 + aten::_efficient_attention_forward 0.36% 28.918us 1.02% 82.141us 27.380us 5.676ms 89.51% 5.676ms 1.892ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.676ms 89.51% 5.676ms 1.892ms 3 + aten::contiguous 0.09% 7.578us 22.96% 1.852ms 205.774us 0.000us 0.00% 725.410us 80.601us 9 + aten::clone 0.27% 22.113us 22.87% 1.844ms 204.932us 0.000us 0.00% 725.410us 80.601us 9 + aten::copy_ 0.85% 68.201us 21.96% 1.771ms 196.780us 665.282us 10.49% 725.410us 80.601us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 665.282us 10.49% 665.282us 73.920us 9 + Activity Buffer Request 18.11% 1.461ms 18.11% 1.461ms 1.461ms 60.128us 0.95% 60.128us 60.128us 1 + aten::transpose 0.57% 46.288us 0.78% 62.529us 2.605us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.20% 16.241us 0.20% 16.241us 0.677us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 12.469us 0.64% 51.250us 5.694us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.80% 64.494us 0.80% 64.494us 3.071us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.27% 263.876us 3.27% 263.876us 21.990us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.440us 0.03% 2.440us 0.813us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.380us 0.04% 3.380us 1.127us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.65% 5.779ms 71.65% 5.779ms 5.779ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.965ms -Self CUDA time total: 6.262ms +Self CPU time total: 8.066ms +Self CUDA time total: 6.342ms @@ -4079,36 +4087,36 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.00% 248.403us 26.98% 2.232ms 2.232ms 0.000us 0.00% 6.668ms 6.668ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.616ms 100.13% 6.616ms 6.616ms 1 - aten::scaled_dot_product_attention 0.21% 17.221us 1.72% 142.654us 47.551us 0.000us 0.00% 5.939ms 1.980ms 3 - aten::_scaled_dot_product_efficient_attention 0.23% 18.779us 1.52% 125.433us 41.811us 0.000us 0.00% 5.939ms 1.980ms 3 - aten::_efficient_attention_forward 0.34% 28.440us 0.99% 81.712us 27.237us 5.939ms 89.88% 5.939ms 1.980ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 89.88% 5.939ms 1.980ms 3 - aten::contiguous 0.08% 6.861us 21.66% 1.792ms 199.142us 0.000us 0.00% 729.440us 81.049us 9 - aten::clone 0.26% 21.352us 21.58% 1.785ms 198.379us 0.000us 0.00% 729.440us 81.049us 9 - aten::copy_ 0.83% 69.012us 20.65% 1.709ms 189.858us 668.928us 10.12% 729.440us 81.049us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.928us 10.12% 668.928us 74.325us 9 - Activity Buffer Request 17.29% 1.430ms 17.29% 1.430ms 1.430ms 60.512us 0.92% 60.512us 60.512us 1 - aten::transpose 0.63% 51.780us 0.89% 73.784us 3.074us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.27% 22.004us 0.27% 22.004us 0.917us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.14% 11.870us 0.67% 55.340us 6.149us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.84% 69.312us 0.84% 69.312us 3.301us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 2.79% 231.145us 2.79% 231.145us 19.262us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.280us 0.03% 2.280us 0.760us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.570us 0.04% 3.570us 1.190us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 73.02% 6.041ms 73.02% 6.041ms 6.041ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 2.86% 239.115us 26.99% 2.259ms 2.259ms 0.000us 0.00% 6.718ms 6.718ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.665ms 100.13% 6.665ms 6.665ms 1 + aten::scaled_dot_product_attention 0.23% 19.210us 1.67% 139.873us 46.624us 0.000us 0.00% 5.983ms 1.994ms 3 + aten::_scaled_dot_product_efficient_attention 0.22% 18.712us 1.44% 120.663us 40.221us 0.000us 0.00% 5.983ms 1.994ms 3 + aten::_efficient_attention_forward 0.33% 27.381us 0.94% 78.541us 26.180us 5.983ms 89.89% 5.983ms 1.994ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983ms 89.89% 5.983ms 1.994ms 3 + aten::contiguous 0.09% 7.469us 21.99% 1.841ms 204.601us 0.000us 0.00% 734.336us 81.593us 9 + aten::clone 0.27% 22.450us 21.90% 1.834ms 203.772us 0.000us 0.00% 734.336us 81.593us 9 + aten::copy_ 0.80% 67.050us 21.01% 1.759ms 195.442us 673.088us 10.11% 734.336us 81.593us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 673.088us 10.11% 673.088us 74.788us 9 + Activity Buffer Request 17.30% 1.449ms 17.30% 1.449ms 1.449ms 61.248us 0.92% 61.248us 61.248us 1 + aten::transpose 0.55% 46.102us 0.74% 62.332us 2.597us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.19% 16.230us 0.19% 16.230us 0.676us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.14% 11.891us 0.63% 52.512us 5.835us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.78% 65.061us 0.78% 65.061us 3.098us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.16% 264.678us 3.16% 264.678us 22.056us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.400us 0.03% 2.400us 0.800us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 2.990us 0.04% 2.990us 0.997us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.01% 6.113ms 73.01% 6.113ms 6.113ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 8.273ms -Self CUDA time total: 6.608ms +Self CPU time total: 8.372ms +Self CUDA time total: 6.656ms impl wl p50(ms) ok torch_mem_eff cuda_attn_L128_bfloat16 1.83 True -torch_mem_eff cuda_attn_L256_bfloat16 1.89 True -torch_mem_eff cuda_attn_L320_bfloat16 2.00 True -torch_mem_eff cuda_attn_L384_bfloat16 1.97 True -torch_mem_eff cuda_attn_L448_bfloat16 2.06 True +torch_mem_eff cuda_attn_L256_bfloat16 1.94 True +torch_mem_eff cuda_attn_L320_bfloat16 1.95 True +torch_mem_eff cuda_attn_L384_bfloat16 2.05 True +torch_mem_eff cuda_attn_L448_bfloat16 2.07 True torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html index 9d07a2ce157ec6414ddbe4c27bea52ef7ed253b0..db09302998effc2a2495463eeac998d4c82afce6 100644 --- a/flash_attn/impls/sage_attention.html +++ b/flash_attn/impls/sage_attention.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3869,9 +3877,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 4.53s +Cell: benchmark | 4.19s | Raw @@ -3920,28 +3928,23 @@ Cell: benchmark | 4.53s
Running attention benchmark on cuda with 6 workloads.
 impl                     wl                  p50(ms)  ok
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
+  Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
+  Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
+  Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
+  Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
+  Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
+  Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd'
 
-
-
▶ UV Install Logs
- +
+Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] +Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.96it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19.18it/s]
-
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 15.79it/s] -Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.55it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 18.83it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html index 6363e024de1afb10cb31713f99cf844d998ebe90..54dd655ec44ee5446d98dfae5cd6b8726476ab88 100644 --- a/flash_attn/impls/xformers.html +++ b/flash_attn/impls/xformers.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: benchmark | 5.02s +Cell: benchmark | 33.44s | Raw @@ -3923,21 +3931,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 10.85% 481.112us 51.55% 2.285ms 2.285ms 0.000us 0.00% 3.582ms 3.582ms 1 - xformers_flash3::flash_fwd 4.56% 202.185us 39.85% 1.766ms 588.715us 0.000us 0.00% 3.582ms 1.194ms 3 - flash_attn_3::fwd 1.68% 74.662us 35.29% 1.564ms 521.320us 2.681ms 100.00% 3.582ms 1.194ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.682ms 100.06% 2.682ms 2.682ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.681ms 100.00% 2.681ms 893.515us 3 - Activity Buffer Request 31.74% 1.407ms 31.74% 1.407ms 1.407ms 901.761us 33.64% 901.761us 901.761us 1 - aten::empty 0.77% 33.920us 0.77% 33.920us 5.653us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.23% 10.152us 0.23% 10.152us 3.384us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.87% 38.521us 0.87% 38.521us 12.840us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.29% 13.028us 0.85% 37.710us 6.285us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.56% 24.682us 0.56% 24.682us 4.114us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 48.45% 2.147ms 48.45% 2.147ms 2.147ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 9.89% 457.200us 48.78% 2.255ms 2.255ms 0.000us 0.00% 3.820ms 3.820ms 1 + xformers_flash3::flash_fwd 3.84% 177.424us 38.10% 1.761ms 587.077us 0.000us 0.00% 3.820ms 1.273ms 3 + flash_attn_3::fwd 1.55% 71.862us 34.26% 1.584ms 527.935us 2.885ms 100.00% 3.820ms 1.273ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.886ms 100.04% 2.886ms 2.886ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.00% 2.885ms 961.658us 3 + Activity Buffer Request 30.73% 1.420ms 30.73% 1.420ms 1.420ms 934.553us 32.39% 934.553us 934.553us 1 + aten::empty 0.74% 34.201us 0.74% 34.201us 5.700us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.22% 10.110us 0.22% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.02% 47.230us 1.02% 47.230us 15.743us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.34% 15.510us 0.79% 36.581us 6.097us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.46% 21.071us 0.46% 21.071us 3.512us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 51.22% 2.368ms 51.22% 2.368ms 2.368ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.432ms -Self CUDA time total: 2.681ms +Self CPU time total: 4.623ms +Self CUDA time total: 2.885ms @@ -3947,21 +3955,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 7.16% 317.438us 45.96% 2.036ms 2.036ms 0.000us 0.00% 3.779ms 3.779ms 1 - xformers_flash3::flash_fwd 3.35% 148.243us 38.25% 1.695ms 564.991us 0.000us 0.00% 3.779ms 1.260ms 3 - flash_attn_3::fwd 1.25% 55.403us 34.91% 1.547ms 515.576us 2.825ms 100.00% 3.779ms 1.260ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.827ms 100.05% 2.827ms 2.827ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.825ms 100.00% 2.825ms 941.739us 3 - Activity Buffer Request 32.14% 1.424ms 32.14% 1.424ms 1.424ms 954.080us 33.77% 954.080us 954.080us 1 - aten::empty 0.63% 27.720us 0.63% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.400us 0.12% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.77% 34.161us 0.77% 34.161us 11.387us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.21% 9.370us 0.54% 23.750us 3.958us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.32% 14.380us 0.32% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 54.04% 2.395ms 54.04% 2.395ms 2.395ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.56% 301.335us 45.12% 2.073ms 2.073ms 0.000us 0.00% 3.862ms 3.862ms 1 + xformers_flash3::flash_fwd 3.02% 138.865us 38.04% 1.748ms 582.607us 0.000us 0.00% 3.862ms 1.287ms 3 + flash_attn_3::fwd 1.15% 53.013us 35.02% 1.609ms 536.319us 2.932ms 100.00% 3.862ms 1.287ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.933ms 100.04% 2.933ms 2.933ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.308us 3 + Activity Buffer Request 32.36% 1.487ms 32.36% 1.487ms 1.487ms 930.332us 31.73% 930.332us 930.332us 1 + aten::empty 0.65% 29.679us 0.65% 29.679us 4.946us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.591us 0.12% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.74% 34.170us 0.74% 34.170us 11.390us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.22% 9.881us 0.51% 23.631us 3.938us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.30% 13.750us 0.30% 13.750us 2.292us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 54.88% 2.521ms 54.88% 2.521ms 2.521ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.431ms -Self CUDA time total: 2.825ms +Self CPU time total: 4.594ms +Self CUDA time total: 2.932ms @@ -3971,21 +3979,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.87% 310.027us 44.72% 2.018ms 2.018ms 0.000us 0.00% 3.923ms 3.923ms 1 - xformers_flash3::flash_fwd 3.22% 145.444us 37.33% 1.684ms 561.324us 0.000us 0.00% 3.923ms 1.308ms 3 - flash_attn_3::fwd 1.15% 52.002us 34.10% 1.539ms 512.843us 2.919ms 100.00% 3.923ms 1.308ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.921ms 100.06% 2.921ms 2.921ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.919ms 100.00% 2.919ms 973.037us 3 - Activity Buffer Request 31.44% 1.418ms 31.44% 1.418ms 1.418ms 1.004ms 34.40% 1.004ms 1.004ms 1 - aten::empty 0.63% 28.392us 0.63% 28.392us 4.732us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.76% 34.420us 0.76% 34.420us 11.473us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.21% 9.519us 0.52% 23.650us 3.942us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.31% 14.131us 0.31% 14.131us 2.355us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 55.28% 2.494ms 55.28% 2.494ms 2.494ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.47% 295.057us 44.36% 2.024ms 2.024ms 0.000us 0.00% 3.906ms 3.906ms 1 + xformers_flash3::flash_fwd 3.08% 140.693us 37.39% 1.706ms 568.676us 0.000us 0.00% 3.906ms 1.302ms 3 + flash_attn_3::fwd 1.15% 52.641us 34.31% 1.565ms 521.779us 2.948ms 100.00% 3.906ms 1.302ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.949ms 100.05% 2.949ms 2.949ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.948ms 100.00% 2.948ms 982.658us 3 + Activity Buffer Request 31.65% 1.444ms 31.65% 1.444ms 1.444ms 958.263us 32.51% 958.263us 958.263us 1 + aten::empty 0.65% 29.440us 0.65% 29.440us 4.907us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.511us 0.12% 5.511us 1.837us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.74% 33.911us 0.74% 33.911us 11.304us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 8.109us 0.50% 22.850us 3.808us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.32% 14.741us 0.32% 14.741us 2.457us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 55.64% 2.539ms 55.64% 2.539ms 2.539ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.511ms -Self CUDA time total: 2.919ms +Self CPU time total: 4.562ms +Self CUDA time total: 2.948ms @@ -3995,21 +4003,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.73% 317.798us 47.46% 2.241ms 2.241ms 0.000us 0.00% 3.892ms 3.892ms 1 - xformers_flash3::flash_fwd 3.10% 146.544us 40.23% 1.900ms 633.169us 0.000us 0.00% 3.892ms 1.297ms 3 - flash_attn_3::fwd 1.15% 54.462us 37.13% 1.753ms 584.321us 2.910ms 100.00% 3.892ms 1.297ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.05% 2.911ms 2.911ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.910ms 100.00% 2.910ms 969.848us 3 - Activity Buffer Request 30.01% 1.417ms 30.01% 1.417ms 1.417ms 982.915us 33.78% 982.915us 982.915us 1 - aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.370us 0.11% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 5.23% 247.156us 5.23% 247.156us 82.385us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.20% 9.560us 0.50% 23.460us 3.910us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.29% 13.900us 0.29% 13.900us 2.317us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 52.54% 2.481ms 52.54% 2.481ms 2.481ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.44% 300.857us 47.49% 2.217ms 2.217ms 0.000us 0.00% 3.827ms 3.827ms 1 + xformers_flash3::flash_fwd 3.16% 147.703us 40.53% 1.892ms 630.694us 0.000us 0.00% 3.827ms 1.276ms 3 + flash_attn_3::fwd 1.13% 52.820us 37.36% 1.744ms 581.460us 2.874ms 100.00% 3.827ms 1.276ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.876ms 100.05% 2.876ms 2.876ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.874ms 100.00% 2.874ms 958.161us 3 + Activity Buffer Request 30.85% 1.440ms 30.85% 1.440ms 1.440ms 952.124us 33.12% 952.124us 952.124us 1 + aten::empty 0.63% 29.391us 0.63% 29.391us 4.899us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.13% 5.930us 0.13% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.63% 215.955us 4.63% 215.955us 71.985us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.22% 10.380us 0.51% 23.940us 3.990us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.29% 13.560us 0.29% 13.560us 2.260us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 52.51% 2.452ms 52.51% 2.452ms 2.452ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.721ms -Self CUDA time total: 2.910ms +Self CPU time total: 4.669ms +Self CUDA time total: 2.874ms @@ -4019,21 +4027,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 5.86% 306.369us 41.94% 2.193ms 2.193ms 0.000us 0.00% 4.614ms 4.614ms 1 - xformers_flash3::flash_fwd 2.85% 149.202us 35.63% 1.863ms 620.885us 0.000us 0.00% 4.614ms 1.538ms 3 - flash_attn_3::fwd 1.03% 53.951us 32.77% 1.713ms 571.151us 3.461ms 100.00% 4.614ms 1.538ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.462ms 100.04% 3.462ms 3.462ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.461ms 100.00% 3.461ms 1.154ms 3 - Activity Buffer Request 27.28% 1.426ms 27.28% 1.426ms 1.426ms 1.153ms 33.31% 1.153ms 1.153ms 1 - aten::empty 0.55% 28.813us 0.55% 28.813us 4.802us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.560us 0.11% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.80% 198.684us 3.80% 198.684us 66.228us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.18% 9.430us 0.46% 23.930us 3.988us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.28% 14.500us 0.28% 14.500us 2.417us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 58.06% 3.036ms 58.06% 3.036ms 3.036ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.75% 298.955us 42.23% 2.194ms 2.194ms 0.000us 0.00% 4.560ms 4.560ms 1 + xformers_flash3::flash_fwd 2.73% 142.094us 36.04% 1.872ms 624.074us 0.000us 0.00% 4.560ms 1.520ms 3 + flash_attn_3::fwd 1.06% 54.881us 33.30% 1.730ms 576.710us 3.413ms 100.00% 4.560ms 1.520ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.415ms 100.04% 3.415ms 3.415ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.00% 3.413ms 1.138ms 3 + Activity Buffer Request 27.56% 1.432ms 27.56% 1.432ms 1.432ms 1.147ms 33.59% 1.147ms 1.147ms 1 + aten::empty 0.56% 28.860us 0.56% 28.860us 4.810us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.420us 0.10% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.02% 208.865us 4.02% 208.865us 69.622us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 9.222us 0.44% 22.901us 3.817us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.26% 13.679us 0.26% 13.679us 2.280us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 57.77% 3.001ms 57.77% 3.001ms 3.001ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.228ms -Self CUDA time total: 3.461ms +Self CPU time total: 5.196ms +Self CUDA time total: 3.413ms @@ -4043,37 +4051,83 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 5.96% 310.158us 41.66% 2.167ms 2.167ms 0.000us 0.00% 4.643ms 4.643ms 1 - xformers_flash3::flash_fwd 2.83% 146.954us 35.22% 1.832ms 610.728us 0.000us 0.00% 4.643ms 1.548ms 3 - flash_attn_3::fwd 1.00% 51.911us 32.40% 1.685ms 561.744us 3.464ms 100.00% 4.643ms 1.548ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.04% 3.465ms 3.465ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.00% 3.464ms 1.155ms 3 - Activity Buffer Request 27.49% 1.430ms 27.49% 1.430ms 1.430ms 1.179ms 34.05% 1.179ms 1.179ms 1 - aten::empty 0.54% 28.311us 0.54% 28.311us 4.719us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.750us 0.11% 5.750us 1.917us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.25% 169.084us 3.25% 169.084us 56.361us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.17% 8.670us 0.48% 24.720us 4.120us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.31% 16.050us 0.31% 16.050us 2.675us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 58.34% 3.035ms 58.34% 3.035ms 3.035ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.27% 272.556us 42.19% 2.184ms 2.184ms 0.000us 0.00% 4.536ms 4.536ms 1 + xformers_flash3::flash_fwd 2.70% 139.942us 36.49% 1.889ms 629.618us 0.000us 0.00% 4.536ms 1.512ms 3 + flash_attn_3::fwd 1.02% 52.981us 33.79% 1.749ms 582.970us 3.398ms 100.00% 4.536ms 1.512ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.400ms 100.05% 3.400ms 3.400ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3 + Activity Buffer Request 28.10% 1.454ms 28.10% 1.454ms 1.454ms 1.138ms 33.49% 1.138ms 1.138ms 1 + aten::empty 0.56% 28.991us 0.56% 28.991us 4.832us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.511us 0.11% 5.511us 1.837us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.00% 207.225us 4.00% 207.225us 69.075us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.17% 8.891us 0.44% 22.532us 3.755us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.26% 13.641us 0.26% 13.641us 2.274us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 57.81% 2.992ms 57.81% 2.992ms 2.992ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.202ms -Self CUDA time total: 3.464ms +Self CPU time total: 5.176ms +Self CUDA time total: 3.398ms impl wl p50(ms) ok -xformers_meff cuda_attn_L128_bfloat16 1.00 True -xformers_meff cuda_attn_L256_bfloat16 1.03 True -xformers_meff cuda_attn_L320_bfloat16 1.08 True -xformers_meff cuda_attn_L384_bfloat16 1.09 True -xformers_meff cuda_attn_L448_bfloat16 1.25 True -xformers_meff cuda_attn_L512_bfloat16 1.24 True +xformers_meff cuda_attn_L128_bfloat16 0.99 True +xformers_meff cuda_attn_L256_bfloat16 1.05 True +xformers_meff cuda_attn_L320_bfloat16 1.06 True +xformers_meff cuda_attn_L384_bfloat16 1.06 True +xformers_meff cuda_attn_L448_bfloat16 1.23 True +xformers_meff cuda_attn_L512_bfloat16 1.23 True
▶ UV Install Logs
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg index 19f0903d77a8fb32c0a3ed03553c82706371801e..c02d0aa1d7f051dd29e4395a43dce26846a4ea67 100644 --- a/flash_attn/results/artifacts/combine/latency.svg +++ b/flash_attn/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a7d7b3dc8fc6b60a4b9f8bfcf3e229706548b71a8174822b89cc9a2746d3bbd -size 24787 +oid sha256:567d9c3aecb5f005a8679995284fab5112829f643a670a3a2d3688588b305153 +size 24770 diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html index 3a2204532e0ec8ef3588194f5c38935fb60f8208..0e9002191f5b7bbc1a144cc32a8bbb45ee055959 100644 --- a/flash_attn/results/combined_results.html +++ b/flash_attn/results/combined_results.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-29T14:28:03.109695 + 2025-10-29T15:51:09.340715 image/svg+xml @@ -3982,96 +3990,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 1.0 + 1.0 - + - + - 1.2 + 1.2 - + - + - 1.4 + 1.4 - + - + - 1.6 + 1.6 - + - + - 1.8 + 1.8 - + - + - 2.0 + 2.0 - + - + - 2.2 + 2.2 @@ -4079,73 +4087,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + @@ -4230,7 +4238,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.25s +Cell: combine | 4.24s | Raw @@ -4337,48 +4345,48 @@ Summary: 6 found, 0 skipped, 0 missing COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True -hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True -hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True -hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True -hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True -hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True -hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True +hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True +hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True +hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.19 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 1.00 True +hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.99 True +hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True +hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.15 True sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False - Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' + Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False - Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' + Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False - Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' + Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False - Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' + Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False - Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' + Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False - Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd' -torch_flash_ma cuda_attn_L128_bfloat16 1.22 True -torch_flash_ma cuda_attn_L256_bfloat16 1.28 True -torch_flash_ma cuda_attn_L320_bfloat16 1.29 True -torch_flash_ma cuda_attn_L384_bfloat16 1.33 True -torch_flash_ma cuda_attn_L448_bfloat16 1.47 True -torch_flash_ma cuda_attn_L512_bfloat16 1.50 True + Error: module 'sage_attention_d4f4a6803f593c0b' has no attribute 'fwd' +torch_flash_ma cuda_attn_L128_bfloat16 1.21 True +torch_flash_ma cuda_attn_L256_bfloat16 1.27 True +torch_flash_ma cuda_attn_L320_bfloat16 1.30 True +torch_flash_ma cuda_attn_L384_bfloat16 1.32 True +torch_flash_ma cuda_attn_L448_bfloat16 1.48 True +torch_flash_ma cuda_attn_L512_bfloat16 1.49 True torch_mem_eff cuda_attn_L128_bfloat16 1.83 True -torch_mem_eff cuda_attn_L256_bfloat16 1.89 True -torch_mem_eff cuda_attn_L320_bfloat16 2.00 True -torch_mem_eff cuda_attn_L384_bfloat16 1.97 True -torch_mem_eff cuda_attn_L448_bfloat16 2.06 True +torch_mem_eff cuda_attn_L256_bfloat16 1.94 True +torch_mem_eff cuda_attn_L320_bfloat16 1.95 True +torch_mem_eff cuda_attn_L384_bfloat16 2.05 True +torch_mem_eff cuda_attn_L448_bfloat16 2.07 True torch_mem_eff cuda_attn_L512_bfloat16 2.19 True -xformers_meff cuda_attn_L128_bfloat16 1.00 True -xformers_meff cuda_attn_L256_bfloat16 1.03 True -xformers_meff cuda_attn_L320_bfloat16 1.08 True -xformers_meff cuda_attn_L384_bfloat16 1.09 True -xformers_meff cuda_attn_L448_bfloat16 1.25 True -xformers_meff cuda_attn_L512_bfloat16 1.24 True +xformers_meff cuda_attn_L128_bfloat16 0.99 True +xformers_meff cuda_attn_L256_bfloat16 1.05 True +xformers_meff cuda_attn_L320_bfloat16 1.06 True +xformers_meff cuda_attn_L384_bfloat16 1.06 True +xformers_meff cuda_attn_L448_bfloat16 1.23 True +xformers_meff cuda_attn_L512_bfloat16 1.23 True GENERATING COMBINED VISUALIZATION @@ -4402,7 +4410,7 @@ Implementations included:
▶ UV Install Logs
@@ -4415,7 +4423,7 @@ Installed 37 packages in 208ms - 2025-10-29T14:28:03.109695 + 2025-10-29T15:51:09.340715 image/svg+xml @@ -4525,96 +4533,96 @@ Installed 37 packages in 208ms - + - + - 1.0 + 1.0 - + - + - 1.2 + 1.2 - + - + - 1.4 + 1.4 - + - + - 1.6 + 1.6 - + - + - 1.8 + 1.8 - + - + - 2.0 + 2.0 - + - + - 2.2 + 2.2 @@ -4622,73 +4630,73 @@ Installed 37 packages in 208ms - + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + diff --git a/index.html b/index.html index 1061b4b3222caa3480fdd412bcf6f18bb97b54f9..c083b24d47b7d0832c42b35c69f8c3a33c6269c0 100644 --- a/index.html +++ b/index.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3865,8 +3873,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

All Benchmarks Aggregated Report

Layer Norm

- - +Layer Norm Latency
@@ -3889,8 +3896,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

Rotary Position Embeddings

- - +Rotary Position Embeddings Latency
@@ -3913,8 +3919,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

Flash Attention

- - +Flash Attention Latency
@@ -3953,8 +3958,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

Causal Conv1D

- - +Causal Conv1D Latency
@@ -3977,8 +3981,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

Activation

- - +Activation Latency
@@ -4001,8 +4004,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

ReLU

- - +ReLU Latency
diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl index 611975ecd9585a8b6f1198e5f9cf417087baa85d..adc647f4911ff8521ab72538ff2dcd6933790cd5 100644 --- a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl @@ -1,4 +1,4 @@ -{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8322699999894212, "p50": 0.8364899999833142, "p90": 0.8382409999967422, "mean": 0.8359703999872181, "iqr": 0.0036810000096920703, "raw_times": [0.8322699999894212, 0.8382909999795629, 0.8345599999870501, 0.8382409999967422, 0.8364899999833142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8381600000006983, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6435499999829517, "p50": 1.6499199999771008, "p90": 1.6516499999852385, "mean": 1.650240399987979, "iqr": 0.0024989999474200886, "raw_times": [1.6516499999852385, 1.6435499999829517, 1.6499199999771008, 1.6491510000378184, 1.656930999956785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.644769999984419, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6425610000396773, "p50": 1.6517310000381258, "p90": 1.654420999955164, "mean": 1.6505027999983213, "iqr": 0.006990999963818467, "raw_times": [1.6474299999913455, 1.6517310000381258, 1.654420999955164, 1.6563709999672938, 1.6425610000396773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6471609999939574, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} -{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.237169999977141, "p50": 3.2576509999557857, "p90": 3.264301000001524, "mean": 3.257706599993071, "iqr": 0.008230999981151399, "raw_times": [3.2576509999557857, 3.264301000001524, 3.2733410000105323, 3.2560700000203724, 3.237169999977141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2725309999932506, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8137589999819284, "p50": 0.8219090000238793, "p90": 0.8223789999988185, "mean": 0.8196492000138278, "iqr": 0.007259999961206631, "raw_times": [0.825080000026901, 0.8219090000238793, 0.8137589999819284, 0.8223789999988185, 0.8151190000376118], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8213489999775447, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.680888999999297, "p50": 1.6820789999769659, "p90": 1.6842590000010205, "mean": 1.683131400000093, "iqr": 0.0026189999857706425, "raw_times": [1.6820789999769659, 1.680888999999297, 1.6816400000152498, 1.6842590000010205, 1.6867900000079317], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.687689999982922, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T15:50:43Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.603787000021839, "p50": 1.6093779999550861, "p90": 1.6102179999961663, "mean": 1.6086159999986194, "iqr": 0.002069999993636884, "raw_times": [1.6093779999550861, 1.6081480000025294, 1.603787000021839, 1.611549000017476, 1.6102179999961663], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6238279999924998, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T15:50:44Z", "run": "146df378aaa14293b5d1526656fc6cb4", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.30805800001599, "p50": 3.3301390000133324, "p90": 3.331328999991001, "mean": 3.3278527999868857, "iqr": 0.001610000026630587, "raw_times": [3.331328999991001, 3.3400189999497343, 3.3297189999643706, 3.30805800001599, 3.3301390000133324], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.3235790000389898, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null} diff --git a/layer_norm/impls/cells/benchmark.py b/layer_norm/impls/cells/benchmark.py index d871d1b25fedf8b294c567e9ac582decb62f3cde..6a00a9f99d8d044ab5f9dc0f5019344cef0612b9 100644 --- a/layer_norm/impls/cells/benchmark.py +++ b/layer_norm/impls/cells/benchmark.py @@ -3,7 +3,6 @@ # dependencies = [ # "numpy", # "torch==2.8.0", -# "kernels", # "kernels-benchmark-tools", # ] # @@ -13,37 +12,15 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -from kernels import get_kernel -# Load the layer norm kernel -layer_norm_kernel = get_kernel("kernels-community/layer-norm") - -def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5): - B, S, D = x.shape - # The kernel expects [N, D] input; support beta (bias) if provided. - out = layer_norm_kernel.dropout_add_ln_fwd( - input=x.view(-1, D), - gamma=weight, - beta=bias, - rowscale=None, - colscale=None, - x0_subset=None, - z_subset=None, - dropout_p=0.0, - epsilon=eps, - rowscale_const=1.0, - z_numrows=S, - gen=None, - residual_in_fp32=False, - is_rms_norm=False, - )[0].view(B, S, D) - return out +def torch_layer_norm(x, weight, bias, eps: float = 1e-5): + return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps) run_benchmark( kernel_type=KernelTypeEnum.LAYER_NORM, - impl_name="hf_kernels_layer_norm", - impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, - impl_func=hf_kernels_layer_norm, + impl_name="torch_layer_norm", + impl_tags={"family": "torch", "op": "layer_norm"}, + impl_func=torch_layer_norm, ) \ No newline at end of file diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html index 9e9cf8da940eb80e201b94351f6e97b42048c103..4563b98410b2fc06b9c1913c549409aa9aa56f34 100644 --- a/layer_norm/impls/hf_kernels_layer_norm.html +++ b/layer_norm/impls/hf_kernels_layer_norm.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output▶ uv-logs | -Cell: benchmark | 6.34s +Cell: benchmark | 9.83s | Raw @@ -3943,19 +3951,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 5.26% 209.855us 46.73% 1.864ms 1.864ms 0.000us 0.00% 3.097ms 3.097ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 1.78% 70.832us 40.86% 1.630ms 543.337us 2.360ms 100.00% 3.097ms 1.032ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.06% 2.362ms 2.362ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.360ms 100.00% 2.360ms 786.699us 3 - Activity Buffer Request 36.61% 1.461ms 36.61% 1.461ms 1.461ms 736.736us 31.22% 736.736us 736.736us 1 - aten::view 0.61% 24.271us 0.61% 24.271us 4.045us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 1.19% 47.642us 1.19% 47.642us 5.294us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 10.789us 0.27% 10.789us 3.596us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.01% 40.102us 1.01% 40.102us 13.367us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 53.27% 2.125ms 53.27% 2.125ms 2.125ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 4.95% 198.743us 46.81% 1.878ms 1.878ms 0.000us 0.00% 3.111ms 3.111ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 1.73% 69.535us 41.21% 1.653ms 550.933us 2.375ms 100.00% 3.111ms 1.037ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.376ms 100.07% 2.376ms 2.376ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.375ms 100.00% 2.375ms 791.590us 3 + Activity Buffer Request 36.98% 1.483ms 36.98% 1.483ms 1.483ms 736.636us 31.02% 736.636us 736.636us 1 + aten::view 0.65% 26.132us 0.65% 26.132us 4.355us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.22% 49.009us 1.22% 49.009us 5.445us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 8.769us 0.22% 8.769us 2.923us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.05% 42.291us 1.05% 42.291us 14.097us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 53.19% 2.133ms 53.19% 2.133ms 2.133ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.989ms -Self CUDA time total: 2.360ms +Self CPU time total: 4.011ms +Self CUDA time total: 2.375ms @@ -3965,19 +3973,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 2.24% 143.733us 27.27% 1.751ms 1.751ms 0.000us 0.00% 6.440ms 6.440ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 48.181us 24.84% 1.595ms 531.669us 4.846ms 100.00% 6.440ms 2.147ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.848ms 100.03% 4.848ms 4.848ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.846ms 100.00% 4.846ms 1.615ms 3 - Activity Buffer Request 23.08% 1.482ms 23.08% 1.482ms 1.482ms 1.594ms 32.88% 1.594ms 1.594ms 1 - aten::view 0.20% 12.572us 0.20% 12.572us 2.095us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.46% 29.840us 0.46% 29.840us 3.316us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.420us 0.08% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.46% 29.490us 0.46% 29.490us 9.830us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 72.73% 4.670ms 72.73% 4.670ms 4.670ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 1.97% 125.105us 26.88% 1.705ms 1.705ms 0.000us 0.00% 6.375ms 6.375ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.170us 24.73% 1.568ms 522.755us 4.809ms 100.00% 6.375ms 2.125ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.811ms 100.03% 4.811ms 4.811ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.809ms 100.00% 4.809ms 1.603ms 3 + Activity Buffer Request 22.98% 1.457ms 22.98% 1.457ms 1.457ms 1.565ms 32.55% 1.565ms 1.565ms 1 + aten::view 0.18% 11.529us 0.18% 11.529us 1.922us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.46% 29.430us 0.46% 29.430us 3.270us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.900us 0.08% 4.900us 1.633us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.48% 30.441us 0.48% 30.441us 10.147us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.12% 4.638ms 73.12% 4.638ms 4.638ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.421ms -Self CUDA time total: 4.846ms +Self CPU time total: 6.343ms +Self CUDA time total: 4.809ms @@ -3987,19 +3995,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 1.96% 126.465us 27.43% 1.766ms 1.766ms 0.000us 0.00% 6.435ms 6.435ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.779us 25.26% 1.627ms 542.360us 4.838ms 100.00% 6.435ms 2.145ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.839ms 100.03% 4.839ms 4.839ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.838ms 100.00% 4.838ms 1.613ms 3 - Activity Buffer Request 23.54% 1.516ms 23.54% 1.516ms 1.516ms 1.597ms 33.01% 1.597ms 1.597ms 1 - aten::view 0.20% 12.929us 0.20% 12.929us 2.155us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.46% 29.911us 0.46% 29.911us 3.323us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.300us 0.08% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.45% 29.003us 0.45% 29.003us 9.668us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 72.57% 4.674ms 72.57% 4.674ms 4.674ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 1.75% 110.793us 26.94% 1.702ms 1.702ms 0.000us 0.00% 6.331ms 6.331ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.70% 44.248us 25.01% 1.580ms 526.532us 4.779ms 100.00% 6.331ms 2.110ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.781ms 100.03% 4.781ms 4.781ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.779ms 100.00% 4.779ms 1.593ms 3 + Activity Buffer Request 23.30% 1.472ms 23.30% 1.472ms 1.472ms 1.552ms 32.48% 1.552ms 1.552ms 1 + aten::view 0.18% 11.190us 0.18% 11.190us 1.865us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.49% 30.823us 0.49% 30.823us 3.425us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.981us 0.08% 4.981us 1.660us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.44% 28.031us 0.44% 28.031us 9.344us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.06% 4.615ms 73.06% 4.615ms 4.615ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.440ms -Self CUDA time total: 4.838ms +Self CPU time total: 6.317ms +Self CUDA time total: 4.779ms @@ -4009,24 +4017,24 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_layer_norm 1.17% 134.085us 17.09% 1.957ms 1.957ms 0.000us 0.00% 12.886ms 12.886ms 1 - _layer_norm_f8ec252::dropout_add_ln_fwd 0.41% 46.869us 15.80% 1.809ms 603.015us 9.665ms 100.00% 12.886ms 4.295ms 3 - hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.667ms 100.01% 9.667ms 9.667ms 1 -void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.665ms 100.00% 9.665ms 3.222ms 3 - Activity Buffer Request 12.76% 1.462ms 12.76% 1.462ms 1.462ms 3.220ms 33.32% 3.220ms 3.220ms 1 - aten::view 0.12% 13.968us 0.12% 13.968us 2.328us 0.000us 0.00% 0.000us 0.000us 6 - aten::empty 0.26% 30.043us 0.26% 30.043us 3.338us 0.000us 0.00% 0.000us 0.000us 9 -cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.590us 0.05% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.31% 264.797us 2.31% 264.797us 88.266us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 82.91% 9.495ms 82.91% 9.495ms 9.495ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_layer_norm 1.11% 111.882us 6.14% 619.354us 619.354us 0.000us 0.00% 12.808ms 12.808ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.46% 46.119us 4.92% 496.462us 165.487us 9.625ms 100.00% 12.808ms 4.269ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.626ms 100.01% 9.626ms 9.626ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.625ms 100.00% 9.625ms 3.208ms 3 + Activity Buffer Request 1.38% 138.943us 1.38% 138.943us 138.943us 3.183ms 33.07% 3.183ms 3.183ms 1 + aten::view 0.11% 11.010us 0.11% 11.010us 1.835us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.31% 31.174us 0.31% 31.174us 3.464us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.190us 0.05% 5.190us 1.730us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.73% 275.036us 2.73% 275.036us 91.679us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 93.86% 9.465ms 93.86% 9.465ms 9.465ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 11.452ms -Self CUDA time total: 9.665ms +Self CPU time total: 10.085ms +Self CUDA time total: 9.625ms impl wl p50(ms) ok -hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True -hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True +hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True +hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True @@ -4035,12 +4043,12 @@ hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.22it/s] -Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.44it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.30it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.60it/s]

Artifacts:

layer_norm.jsonl diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html index f5dd45a5ed15040ec9f80c48eca459fb67a1bc56..0f90777951b3966fba559a5ff047858e89d99906 100644 --- a/layer_norm/impls/torch_layer_norm.html +++ b/layer_norm/impls/torch_layer_norm.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.26s +Cell: nv | 0.23s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.26s
-
Wed Oct 29 14:26:26 2025       
+
Wed Oct 29 15:50:44 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.26s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0            108W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   29C    P0            138W /  350W |       0MiB /  46068MiB |     49%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3918,9 +3926,9 @@ Cell: nv | 0.26s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 7.36s
+Cell: benchmark | 3.85s
  | 
 
 Raw
@@ -3968,19 +3976,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.90%     151.572us        46.01%       1.786ms       1.786ms       0.000us         0.00%       3.026ms       3.026ms             1  
-                                       aten::layer_norm         0.43%      16.762us        42.11%       1.635ms     544.851us       0.000us         0.00%       3.026ms       1.009ms             3  
-                                aten::native_layer_norm         2.06%      80.009us        41.67%       1.618ms     539.263us       2.316ms       100.00%       3.026ms       1.009ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.318ms       100.06%       2.318ms       2.318ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.316ms       100.00%       2.316ms     772.127us             3  
-                                Activity Buffer Request        37.08%       1.440ms        37.08%       1.440ms       1.440ms     709.855us        30.65%     709.855us     709.855us             1  
-                                            aten::empty         1.19%      46.261us         1.19%      46.261us       5.140us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.16%      45.163us         1.16%      45.163us      15.054us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.17%       6.761us         0.17%       6.761us       1.127us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.99%       2.096ms        53.99%       2.096ms       2.096ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.59%     140.394us        45.88%       1.793ms       1.793ms       0.000us         0.00%       3.034ms       3.034ms             1  
+                                       aten::layer_norm         0.43%      16.891us        42.29%       1.653ms     551.033us       0.000us         0.00%       3.034ms       1.011ms             3  
+                                aten::native_layer_norm         2.49%      97.515us        41.85%       1.636ms     545.403us       2.324ms       100.00%       3.034ms       1.011ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.325ms       100.07%       2.325ms       2.325ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.324ms       100.00%       2.324ms     774.631us             3  
+                                Activity Buffer Request        36.92%       1.443ms        36.92%       1.443ms       1.443ms     709.916us        30.55%     709.916us     709.916us             1  
+                                            aten::empty         1.11%      43.309us         1.11%      43.309us       4.812us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.17%      45.620us         1.17%      45.620us      15.207us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.17%       6.600us         0.17%       6.600us       1.100us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        54.12%       2.116ms        54.12%       2.116ms       2.116ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.882ms
-Self CUDA time total: 2.316ms
+Self CPU time total: 3.909ms
+Self CUDA time total: 2.324ms
 
 
 
@@ -3990,19 +3998,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.19%      75.581us        25.55%       1.628ms       1.628ms       0.000us         0.00%       6.473ms       6.473ms             1  
-                                       aten::layer_norm         0.14%       9.142us        24.37%       1.553ms     517.550us       0.000us         0.00%       6.473ms       2.158ms             3  
-                                aten::native_layer_norm         0.81%      51.921us        24.22%       1.544ms     514.502us       4.881ms       100.00%       6.473ms       2.158ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.882ms       100.03%       4.882ms       4.882ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.881ms       100.00%       4.881ms       1.627ms             3  
-                                Activity Buffer Request        22.46%       1.431ms        22.46%       1.431ms       1.431ms       1.592ms        32.61%       1.592ms       1.592ms             1  
-                                            aten::empty         0.44%      27.841us         0.44%      27.841us       3.093us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.45%      28.910us         0.45%      28.910us       9.637us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       3.829us         0.06%       3.829us       0.638us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        74.45%       4.743ms        74.45%       4.743ms       4.743ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.51%      96.533us        25.68%       1.646ms       1.646ms       0.000us         0.00%       6.506ms       6.506ms             1  
+                                       aten::layer_norm         0.14%       9.019us        24.18%       1.550ms     516.535us       0.000us         0.00%       6.506ms       2.169ms             3  
+                                aten::native_layer_norm         0.81%      51.783us        24.04%       1.541ms     513.529us       4.903ms       100.00%       6.506ms       2.169ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.905ms       100.03%       4.905ms       4.905ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.903ms       100.00%       4.903ms       1.634ms             3  
+                                Activity Buffer Request        22.28%       1.428ms        22.28%       1.428ms       1.428ms       1.602ms        32.68%       1.602ms       1.602ms             1  
+                                            aten::empty         0.45%      29.001us         0.45%      29.001us       3.222us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.43%      27.850us         0.43%      27.850us       9.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.07%       4.220us         0.07%       4.220us       0.703us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        74.32%       4.763ms        74.32%       4.763ms       4.763ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.372ms
-Self CUDA time total: 4.881ms
+Self CPU time total: 6.409ms
+Self CUDA time total: 4.903ms
 
 
 
@@ -4012,19 +4020,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.15%      71.882us        26.71%       1.668ms       1.668ms       0.000us         0.00%       6.222ms       6.222ms             1  
-                                       aten::layer_norm         0.15%       9.629us        25.56%       1.596ms     532.153us       0.000us         0.00%       6.222ms       2.074ms             3  
-                                aten::native_layer_norm         0.90%      56.373us        25.41%       1.587ms     528.943us       4.717ms       100.00%       6.222ms       2.074ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.718ms       100.03%       4.718ms       4.718ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.717ms       100.00%       4.717ms       1.572ms             3  
-                                Activity Buffer Request        23.44%       1.464ms        23.44%       1.464ms       1.464ms       1.506ms        31.93%       1.506ms       1.506ms             1  
-                                            aten::empty         0.46%      28.850us         0.46%      28.850us       3.206us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.52%      32.781us         0.52%      32.781us      10.927us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.07%       4.590us         0.07%       4.590us       0.765us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        73.29%       4.577ms        73.29%       4.577ms       4.577ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.49%      93.320us        26.51%       1.656ms       1.656ms       0.000us         0.00%       6.235ms       6.235ms             1  
+                                       aten::layer_norm         0.15%       9.262us        25.02%       1.563ms     520.876us       0.000us         0.00%       6.235ms       2.078ms             3  
+                                aten::native_layer_norm         0.82%      51.181us        24.87%       1.553ms     517.789us       4.722ms       100.00%       6.235ms       2.078ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.723ms       100.03%       4.723ms       4.723ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.722ms       100.00%       4.722ms       1.574ms             3  
+                                Activity Buffer Request        23.09%       1.443ms        23.09%       1.443ms       1.443ms       1.513ms        32.04%       1.513ms       1.513ms             1  
+                                            aten::empty         0.46%      28.530us         0.46%      28.530us       3.170us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.44%      27.431us         0.44%      27.431us       9.144us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.06%       3.670us         0.06%       3.670us       0.612us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        73.49%       4.591ms        73.49%       4.591ms       4.591ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.246ms
-Self CUDA time total: 4.717ms
+Self CPU time total: 6.247ms
+Self CUDA time total: 4.722ms
 
 
 
@@ -4034,19 +4042,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.67%      74.340us        13.35%       1.490ms       1.490ms       0.000us         0.00%      13.028ms      13.028ms             1  
-                                       aten::layer_norm         0.09%       9.510us        12.69%       1.416ms     471.835us       0.000us         0.00%      13.028ms       4.343ms             3  
-                                aten::native_layer_norm         0.47%      52.269us        12.60%       1.406ms     468.665us       9.808ms       100.00%      13.028ms       4.343ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.809ms       100.02%       9.809ms       9.809ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.808ms       100.00%       9.808ms       3.269ms             3  
-                                Activity Buffer Request         9.72%       1.085ms         9.72%       1.085ms       1.085ms       3.220ms        32.83%       3.220ms       3.220ms             1  
-                                            aten::empty         0.26%      29.181us         0.26%      29.181us       3.242us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         2.11%     235.817us         2.11%     235.817us      78.606us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.04%       4.022us         0.04%       4.022us       0.670us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        86.65%       9.669ms        86.65%       9.669ms       9.669ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.75%      86.532us        16.17%       1.873ms       1.873ms       0.000us         0.00%      13.086ms      13.086ms             1  
+                                       aten::layer_norm         0.08%       9.721us        15.43%       1.787ms     595.631us       0.000us         0.00%      13.086ms       4.362ms             3  
+                                aten::native_layer_norm         0.46%      53.132us        15.34%       1.777ms     592.390us       9.848ms       100.00%      13.086ms       4.362ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.850ms       100.01%       9.850ms       9.850ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.848ms       100.00%       9.848ms       3.283ms             3  
+                                Activity Buffer Request        12.61%       1.460ms        12.61%       1.460ms       1.460ms       3.238ms        32.88%       3.238ms       3.238ms             1  
+                                            aten::empty         0.27%      30.840us         0.27%      30.840us       3.427us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.98%     229.105us         1.98%     229.105us      76.368us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.03%       3.969us         0.03%       3.969us       0.661us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        83.83%       9.710ms        83.83%       9.710ms       9.710ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 11.159ms
-Self CUDA time total: 9.808ms
+Self CPU time total: 11.583ms
+Self CUDA time total: 9.848ms
 
 
 impl                     wl                  p50(ms)  ok
@@ -4055,12 +4063,6 @@ torch_layer_norm         LN_B16_S2048_D8192     1.68  True
 torch_layer_norm         LN_B16_S4096_D4096     1.61  True
 torch_layer_norm         LN_B16_S4096_D8192     3.33  True
 
-
-
▶ UV Install Logs
- -

Artifacts:

layer_norm.jsonl diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg index c17ece602ed5ebc325bf99b71237b08ca31fbe89..947f78ebb84373548acf97ad8039350c03e2aa29 100644 --- a/layer_norm/results/artifacts/combine/latency.svg +++ b/layer_norm/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8fd53794c4617f7e947676c655de6f739b720b8f16a59432369c127bfc08190a -size 14644 +oid sha256:9bbb6ba8f80ad7d025abae8130bb65dedc3691b259d1e31011653d588f2a3243 +size 14645 diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html index 5a42e66a6787e88853b7090c03ba6d4a8cd04457..0c0b50ef9540ffebc0a0482c4da77f51aae55517 100644 --- a/layer_norm/results/combined_results.html +++ b/layer_norm/results/combined_results.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-29T14:27:45.722521 + 2025-10-29T15:51:05.081730 image/svg+xml @@ -3956,70 +3964,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 1.0 + 1.0 - + - + - 1.5 + 1.5 - + - + - 2.0 + 2.0 - + - + - 2.5 + 2.5 - + - + - 3.0 + 3.0 @@ -4027,27 +4035,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - - + + - + - - - - + + + + @@ -4105,7 +4113,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.21s +Cell: combine | 4.18s | Raw @@ -4192,8 +4200,8 @@ Summary: 2 found, 0 skipped, 0 missing COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok -hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True -hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True +hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True +hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True torch_layer_norm LN_B16_S2048_D4096 0.82 True @@ -4219,7 +4227,7 @@ Implementations included:
▶ UV Install Logs
@@ -4232,7 +4240,7 @@ Installed 37 packages in 210ms - 2025-10-29T14:27:45.722521 + 2025-10-29T15:51:05.081730 image/svg+xml @@ -4316,70 +4324,70 @@ Installed 37 packages in 210ms - + - + - 1.0 + 1.0 - + - + - 1.5 + 1.5 - + - + - 2.0 + 2.0 - + - + - 2.5 + 2.5 - + - + - 3.0 + 3.0 @@ -4387,27 +4395,27 @@ Installed 37 packages in 210ms - + - - + + - + - - - - + + + + diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl index e407db0807eb78b1db05edcb765f594b555812aa..87d08b7792512bccbf803e823bcfc5a0155d29eb 100644 --- a/rotary/impls/artifacts/benchmark/rotary.jsonl +++ b/rotary/impls/artifacts/benchmark/rotary.jsonl @@ -1,24 +1,24 @@ -{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17006399997399058, "p50": 0.17533400000502297, "p90": 0.1853339999797754, "mean": 0.1802961999942454, "iqr": 0.014799999974002276, "raw_times": [0.17533400000502297, 0.20021500000666492, 0.17053400000577312, 0.1853339999797754, 0.17006399997399058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18331500001522727, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2246159999685915, "p50": 0.2266160000203854, "p90": 0.22888500001272405, "mean": 0.22735560000910482, "iqr": 0.002880000010918593, "raw_times": [0.22600500000180546, 0.22888500001272405, 0.2246159999685915, 0.2266160000203854, 0.2306560000420177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2471160000254713, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21813499995460006, "p50": 0.22189599997091136, "p90": 0.2272149999953399, "mean": 0.22315939997952228, "iqr": 0.007960000004914036, "raw_times": [0.2272149999953399, 0.22189599997091136, 0.21925499999042586, 0.21813499995460006, 0.2292959999863342], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2391049999914685, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21478600001501036, "p50": 0.21544499998071842, "p90": 0.2178249999928994, "mean": 0.2161891999890031, "iqr": 0.0027799999884337012, "raw_times": [0.21544499998071842, 0.2178249999928994, 0.21478600001501036, 0.21784499995192164, 0.2150450000044657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22814599998355334, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160950000416051, "p50": 0.22390499998437008, "p90": 0.22473600000694205, "mean": 0.22559540000202105, "iqr": 0.00507100003233063, "raw_times": [0.22390499998437008, 0.24357600000257662, 0.22473600000694205, 0.2160950000416051, 0.21966499997461142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.229085999990275, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21366499998975996, "p50": 0.21597500000325454, "p90": 0.21670500001391702, "mean": 0.2158129999884295, "iqr": 0.0008600000569458643, "raw_times": [0.21366499998975996, 0.2168749999782449, 0.21597500000325454, 0.21670500001391702, 0.21584499995697115], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21572499997546402, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2142449999951168, "p50": 0.21574499999132968, "p90": 0.2169850000086626, "mean": 0.21585539999477987, "iqr": 0.0022990000161371427, "raw_times": [0.2142449999951168, 0.2176159999862648, 0.21468599999252547, 0.2169850000086626, 0.21574499999132968], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2192349999745602, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21675499999673775, "p50": 0.21711599998752718, "p90": 0.21833499999956985, "mean": 0.2174776000060774, "iqr": 0.0015789999565640755, "raw_times": [0.21675499999673775, 0.21711599998752718, 0.21833499999956985, 0.21675600004300577, 0.2184260000035465], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22064600000248902, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2153649999740992, "p50": 0.21702599997297511, "p90": 0.21829499996783852, "mean": 0.21729759998834197, "iqr": 0.0014989999499448459, "raw_times": [0.2153649999740992, 0.21679600001789368, 0.21900600000890336, 0.21702599997297511, 0.21829499996783852], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22154499998805477, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143060000321384, "p50": 0.2161449999675824, "p90": 0.21640500000330576, "mean": 0.21578740000904872, "iqr": 0.0008589999538344273, "raw_times": [0.21653499999274572, 0.21640500000330576, 0.2143060000321384, 0.2161449999675824, 0.21554600004947133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23455599995259035, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21488499999122723, "p50": 0.21633500000461936, "p90": 0.21918499999173946, "mean": 0.21730919999072285, "iqr": 0.004300000000512227, "raw_times": [0.21488499999122723, 0.21918499999173946, 0.21488499999122723, 0.22125599997480094, 0.21633500000461936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2185359999771208, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2230359999657594, "p50": 0.22526600002947816, "p90": 0.22695600000588456, "mean": 0.22723160000168718, "iqr": 0.0026509999884183344, "raw_times": [0.22526600002947816, 0.2230359999657594, 0.23659499998984757, 0.22430500001746623, 0.22695600000588456], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22456599998577076, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21608499997682884, "p50": 0.2175149999743553, "p90": 0.22948600002337116, "mean": 0.2247094000040306, "iqr": 0.012610999988282856, "raw_times": [0.21608499997682884, 0.2168750000350883, 0.22948600002337116, 0.24358600001050945, 0.2175149999743553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21851499997183055, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2162149999662688, "p50": 0.21694499997693129, "p90": 0.2171250000060354, "mean": 0.21706100000074002, "iqr": 0.0003099999617006688, "raw_times": [0.2162149999662688, 0.21694499997693129, 0.2171250000060354, 0.21820500001012988, 0.21681500004433474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21809500003655558, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2136749999976928, "p50": 0.21658500003240988, "p90": 0.21662599999672238, "mean": 0.21621120000645533, "iqr": 0.00066100000140068, "raw_times": [0.2136749999976928, 0.2159649999953217, 0.21658500003240988, 0.21662599999672238, 0.21820500001012988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2181750000431748, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21695499998486412, "p50": 0.21774499998628016, "p90": 0.2285450000272249, "mean": 0.22256720000086716, "iqr": 0.010920000022451859, "raw_times": [0.21774499998628016, 0.21762500000477303, 0.2319660000011936, 0.21695499998486412, 0.2285450000272249], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22269599998026024, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21440599999777987, "p50": 0.21785499995985447, "p90": 0.2335159999802272, "mean": 0.2228595999895333, "iqr": 0.01891099998374557, "raw_times": [0.21440599999777987, 0.23391600001332336, 0.21785499995985447, 0.2335159999802272, 0.21460499999648164], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21932499998911226, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133250000042608, "p50": 0.21422499997925115, "p90": 0.21653499999274572, "mean": 0.21708740000576654, "iqr": 0.0029589999712698045, "raw_times": [0.21357600002147592, 0.2133250000042608, 0.21653499999274572, 0.21422499997925115, 0.2277760000310991], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22739600001386862, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21597500000325454, "p50": 0.2176859999849512, "p90": 0.21771499996248167, "mean": 0.21758339998996234, "iqr": 0.0013999999737279722, "raw_times": [0.2176859999849512, 0.21771499996248167, 0.22022600001037063, 0.2163149999887537, 0.21597500000325454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21613599994907418, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21577600000455277, "p50": 0.2173749999769825, "p90": 0.21900600000890336, "mean": 0.21836960000882755, "iqr": 0.0018509999790694565, "raw_times": [0.2171550000298339, 0.21577600000455277, 0.2225360000238652, 0.2173749999769825, 0.21900600000890336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22321599999486352, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21538499998996485, "p50": 0.21647599999141676, "p90": 0.21717500004569956, "mean": 0.2167214000110107, "iqr": 0.001030000021273736, "raw_times": [0.21717500004569956, 0.2184260000035465, 0.21538499998996485, 0.21614500002442583, 0.21647599999141676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21872600001415776, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21501500003751062, "p50": 0.2168760000245129, "p90": 0.2187050000088675, "mean": 0.21949320001795058, "iqr": 0.0030500000320898835, "raw_times": [0.21565499997677762, 0.2187050000088675, 0.21501500003751062, 0.23121500004208428, 0.2168760000245129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22076499999457155, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22937599999295344, "p50": 0.23008499999832566, "p90": 0.23144499999716572, "mean": 0.23359140000138723, "iqr": 0.0020100000028833165, "raw_times": [0.23144499999716572, 0.23008499999832566, 0.2294349999942824, 0.22937599999295344, 0.24761600002420892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23195599999326078, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.637245999996594, "p50": 0.6388759999822469, "p90": 0.6389449999915087, "mean": 0.6396317999929124, "iqr": 0.0012190000120426703, "raw_times": [0.6388759999822469, 0.6453660000147465, 0.6389449999915087, 0.637245999996594, 0.637725999979466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6364359999793123, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0735019999638098, "p50": 0.07410199998503231, "p90": 0.07441199994673298, "mean": 0.07416379996811884, "iqr": 0.00038999996831989847, "raw_times": [0.07478099996660603, 0.0735019999638098, 0.07441199994673298, 0.07402199997841308, 0.07410199998503231], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08146199996872383, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0912220000373054, "p50": 0.09200200003078862, "p90": 0.09276200000840618, "mean": 0.09224400001812683, "iqr": 0.0012400000173329317, "raw_times": [0.09152199999107324, 0.09276200000840618, 0.0912220000373054, 0.09200200003078862, 0.09371200002306068], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09689300003401513, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08820200002901402, "p50": 0.09085200002800775, "p90": 0.0915720000307374, "mean": 0.09087420002060753, "iqr": 0.002170000016121776, "raw_times": [0.08820200002901402, 0.09434300000066287, 0.08940200001461562, 0.0915720000307374, 0.09085200002800775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0964319999638974, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09069200001476929, "p50": 0.09134200001881254, "p90": 0.09142199996858835, "mean": 0.09263220000548245, "iqr": 0.0006699999630654929, "raw_times": [0.09069200001476929, 0.09075200000552286, 0.09142199996858835, 0.09895300001971918, 0.09134200001881254], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09313199996086041, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0885120000475581, "p50": 0.08998200001997247, "p90": 0.09122199998046199, "mean": 0.09028400000943293, "iqr": 0.0016600000094513234, "raw_times": [0.09122199998046199, 0.0885120000475581, 0.09214200002816142, 0.08998200001997247, 0.08956199997101066], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1227330000119764, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08860200000526675, "p50": 0.09058200004119499, "p90": 0.09118299999499868, "mean": 0.09031840000943703, "iqr": 0.0011699999618031143, "raw_times": [0.08860200000526675, 0.09001300003319557, 0.09058200004119499, 0.09121199997252916, 0.09118299999499868], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09078199997247793, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:31Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08772200004614206, "p50": 0.09064199997510514, "p90": 0.09105200001613412, "mean": 0.08990000001176668, "iqr": 0.002190000031987438, "raw_times": [0.08772200004614206, 0.09064199997510514, 0.09105200001613412, 0.0912220000373054, 0.08886199998414668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09194199998319164, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08855200002244601, "p50": 0.08938199999874996, "p90": 0.0907319999896572, "mean": 0.0897739999913938, "iqr": 0.0015100000041456951, "raw_times": [0.0892219999855115, 0.0909819999606043, 0.0907319999896572, 0.08855200002244601, 0.08938199999874996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09457200002316313, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08880199999339311, "p50": 0.08953200000405559, "p90": 0.08999199997106189, "mean": 0.08967999999640597, "iqr": 0.0006899999789311551, "raw_times": [0.08880199999339311, 0.08930199999213073, 0.08999199997106189, 0.09077200002138852, 0.08953200000405559], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09282199999915974, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08904199995640738, "p50": 0.09102199999233562, "p90": 0.09121199997252916, "mean": 0.0907579999761765, "iqr": 0.0006099999723119254, "raw_times": [0.08904199995640738, 0.09191199995939314, 0.09121199997252916, 0.09060200000021723, 0.09102199999233562], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09379199997283649, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09005199996181545, "p50": 0.09118200000557408, "p90": 0.0916120000056253, "mean": 0.09133820000215565, "iqr": 0.0005590000000665896, "raw_times": [0.09005199996181545, 0.09105300000555872, 0.09279200003220467, 0.0916120000056253, 0.09118200000557408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09626199999956953, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2600759999609181, "p50": 0.261636000004728, "p90": 0.2620960000285777, "mean": 0.26208780000160914, "iqr": 0.0012810000384888554, "raw_times": [0.2600759999609181, 0.261636000004728, 0.26581600002373307, 0.26081499999008884, 0.2620960000285777], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.263886000027469, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08898199996565381, "p50": 0.09088199999496283, "p90": 0.09099199996853713, "mean": 0.09348599999157159, "iqr": 0.001969999971151992, "raw_times": [0.08898199996565381, 0.09099199996853713, 0.10755200003131904, 0.09088199999496283, 0.08902199999738514], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09600300001011419, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08880199999339311, "p50": 0.09035199997242671, "p90": 0.09093199997778356, "mean": 0.09011999998165265, "iqr": 0.0015400000279441883, "raw_times": [0.08939199994983937, 0.09093199997778356, 0.09035199997242671, 0.09112200001482051, 0.08880199999339311], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09145199999238685, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08985199997368909, "p50": 0.09101199998440279, "p90": 0.09125200000426048, "mean": 0.09087419999787016, "iqr": 0.0002900000026784255, "raw_times": [0.08985199997368909, 0.09096200000158206, 0.0912930000254164, 0.09125200000426048, 0.09101199998440279], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09303199999521894, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08871200003568447, "p50": 0.0907719999645451, "p90": 0.09140200000956611, "mean": 0.09065600000894847, "iqr": 0.001259999976355175, "raw_times": [0.08871200003568447, 0.09225200000173572, 0.09014200003321093, 0.0907719999645451, 0.09140200000956611], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09131100000558945, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08793200004220125, "p50": 0.0902419999988524, "p90": 0.09114200003068618, "mean": 0.09024000002000321, "iqr": 0.001160000010713702, "raw_times": [0.08793200004220125, 0.08998200001997247, 0.0902419999988524, 0.09114200003068618, 0.09190200000830373], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09403199999269418, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.0906619999909708, "p90": 0.09115200003861901, "mean": 0.08998400001019036, "iqr": 0.0016399999935856613, "raw_times": [0.08730199999718025, 0.09115200003861901, 0.09129199997914839, 0.08951200004503335, 0.0906619999909708], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09093099998835896, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08923199999344433, "p50": 0.09018200000809884, "p90": 0.09221200002684782, "mean": 0.09105200000476543, "iqr": 0.0028300000280978566, "raw_times": [0.08923199999344433, 0.09221200002684782, 0.09018200000809884, 0.08938199999874996, 0.09425199999668621], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09410199999138058, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08850200003962527, "p50": 0.0899920000279053, "p90": 0.09176200001093093, "mean": 0.09526220001134789, "iqr": 0.002740000013545796, "raw_times": [0.08850200003962527, 0.0899920000279053, 0.11703299998089278, 0.08902199999738514, 0.09176200001093093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09607300000880059, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0902020000239645, "p50": 0.09163200002149097, "p90": 0.09188199999243807, "mean": 0.09142600000586754, "iqr": 0.0006299999881775875, "raw_times": [0.09163200002149097, 0.09216199998718366, 0.09188199999243807, 0.09125200000426048, 0.0902020000239645], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09537199997566859, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08815199998934986, "p50": 0.08920199996964584, "p90": 0.0900620000265917, "mean": 0.08925999999291889, "iqr": 0.001270000041131425, "raw_times": [0.08815199998934986, 0.09009199999354678, 0.0900620000265917, 0.08920199996964584, 0.08879199998546028], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09250199997268282, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.26207600001271203, "p50": 0.263255999982448, "p90": 0.2654460000144354, "mean": 0.26436599999897226, "iqr": 0.0022400000148081745, "raw_times": [0.26207600001271203, 0.263255999982448, 0.2678459999856386, 0.26320599999962724, 0.2654460000144354], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25824599998713893, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T15:50:32Z", "run": "acca4aa63c524f09b2ae69a99c25a06c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8428699999853961, "p50": 0.8440990000053716, "p90": 0.8457790000306886, "mean": 0.8458453999992344, "iqr": 0.0025290000280620006, "raw_times": [0.8428699999853961, 0.8532289999720888, 0.8432500000026266, 0.8440990000053716, 0.8457790000306886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8568399999830945, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null} diff --git a/rotary/impls/cells/benchmark.py b/rotary/impls/cells/benchmark.py index 94d42ad7f4a476fdf06a84f3b75776b234ecb848..7f6fcb6c184c6611acf24218eb91d13889eaa08e 100644 --- a/rotary/impls/cells/benchmark.py +++ b/rotary/impls/cells/benchmark.py @@ -4,6 +4,7 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", +# "kernels", # ] # # [tool.uv.sources] @@ -12,46 +13,36 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel +# Load the rotary kernel +rotary = get_kernel("kernels-community/rotary") -def apply_rotary_torch(x1, x2, cos, sin, conj=False): - """Reference rotary implementation.""" - if not conj: - out1 = x1 * cos - x2 * sin - out2 = x1 * sin + x2 * cos - else: - out1 = x1 * cos + x2 * sin - out2 = -x1 * sin + x2 * cos - return out1, out2 - -def torch_rotary(query, key, cos, sin, conj=False): +def hf_kernels_rotary(query, key, cos, sin, conj=False): rotary_dim = cos.shape[-1] - # Clone inputs to avoid modifying them + # Clone to avoid modifying inputs q_out = query.clone() k_out = key.clone() # Apply rotation to query q1 = q_out[..., :rotary_dim] q2 = q_out[..., rotary_dim : 2 * rotary_dim] - q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj) - q_out[..., :rotary_dim] = q_out_1 - q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2 + rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj) # Apply rotation to key k1 = k_out[..., :rotary_dim] k2 = k_out[..., rotary_dim : 2 * rotary_dim] - k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj) - k_out[..., :rotary_dim] = k_out_1 - k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2 + rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj) return q_out, k_out run_benchmark( kernel_type=KernelTypeEnum.ROTARY, - impl_name="torch_eager", - impl_tags={"family": "pytorch", "backend": "eager"}, - impl_func=torch_rotary, + impl_name="hf_kernels_rotary", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_rotary, + dtype="float32", ) \ No newline at end of file diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html index 0608b9088d0d84399b39661fd8d9fc01a39dbda5..19809ee65c4f5e1ae23af9d359378c683a06cdd5 100644 --- a/rotary/impls/hf_kernels_rotary.html +++ b/rotary/impls/hf_kernels_rotary.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.20s +Cell: nv | 0.23s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.20s
-
Wed Oct 29 14:26:51 2025       
+
Wed Oct 29 15:50:24 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.20s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   29C    P0             88W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,13 +3928,13 @@ Cell: nv | 0.20s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 7.90s
+Cell: benchmark | 4.45s
  | 
 
 Raw
 GitHub
 
-
+
# /// script
 # requires-python = ">=3.10"
@@ -3974,6 +3982,7 @@ Cell: benchmark | 7.90s
     impl_name="hf_kernels_rotary",
     impl_tags={"family": "hf-kernels", "backend": "cuda"},
     impl_func=hf_kernels_rotary,
+    dtype="float32",
 )
 
@@ -3989,23 +3998,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 433.056us 1833.74% 433.056us 433.056us 1 - hf_kernels_rotary 12.39% 257.808us 99.67% 2.073ms 2.073ms 0.000us 0.00% 24.832us 24.832us 1 - _rotary_dba7d1e::apply_rotary 2.75% 57.199us 5.11% 106.332us 17.722us 16.960us 71.82% 16.960us 2.827us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 71.82% 16.960us 2.827us 6 - aten::clone 2.11% 43.871us 79.26% 1.649ms 274.763us 0.000us 0.00% 7.872us 1.312us 6 - aten::copy_ 2.19% 45.572us 74.13% 1.542ms 256.978us 6.656us 28.18% 7.872us 1.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 28.18% 6.656us 1.109us 6 - Activity Buffer Request 68.36% 1.422ms 68.36% 1.422ms 1.422ms 1.216us 5.15% 1.216us 1.216us 1 - aten::empty_strided 3.02% 62.841us 3.02% 62.841us 10.473us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.58% 74.452us 3.58% 74.452us 12.409us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.28% 47.469us 2.90% 60.410us 5.034us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.62% 12.941us 0.62% 12.941us 1.078us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.36% 49.133us 2.36% 49.133us 8.189us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.33% 6.850us 0.33% 6.850us 6.850us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 415.167us 1791.98% 415.167us 415.167us 1 + hf_kernels_rotary 12.17% 252.218us 99.63% 2.065ms 2.065ms 0.000us 0.00% 24.448us 24.448us 1 + _rotary_dba7d1e::apply_rotary 2.75% 56.920us 5.09% 105.521us 17.587us 16.128us 69.61% 16.128us 2.688us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.128us 69.61% 16.128us 2.688us 6 + aten::clone 2.00% 41.539us 79.52% 1.648ms 274.716us 0.000us 0.00% 8.320us 1.387us 6 + aten::copy_ 1.86% 38.603us 74.72% 1.549ms 258.116us 7.040us 30.39% 8.320us 1.387us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.040us 30.39% 7.040us 1.173us 6 + Activity Buffer Request 69.22% 1.435ms 69.22% 1.435ms 1.435ms 1.280us 5.52% 1.280us 1.280us 1 + aten::empty_strided 2.80% 58.062us 2.80% 58.062us 9.677us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 3.64% 75.429us 3.64% 75.429us 12.571us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.13% 44.231us 2.84% 58.952us 4.913us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.71% 14.721us 0.71% 14.721us 1.227us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.34% 48.601us 2.34% 48.601us 8.100us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.37% 7.760us 0.37% 7.760us 7.760us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.080ms -Self CUDA time total: 23.616us +Self CPU time total: 2.073ms +Self CUDA time total: 23.168us @@ -4015,23 +4024,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 368.319us 1559.68% 368.319us 368.319us 1 - hf_kernels_rotary 8.92% 167.782us 99.73% 1.876ms 1.876ms 0.000us 0.00% 24.767us 24.767us 1 - _rotary_dba7d1e::apply_rotary 2.34% 44.032us 4.50% 84.553us 14.092us 16.832us 71.28% 16.832us 2.805us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 71.28% 16.832us 2.805us 6 - aten::clone 1.16% 21.840us 83.94% 1.579ms 263.113us 0.000us 0.00% 7.935us 1.322us 6 - aten::copy_ 2.86% 53.852us 81.07% 1.525ms 254.111us 6.783us 28.72% 7.935us 1.322us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 28.72% 6.783us 1.130us 6 - Activity Buffer Request 75.10% 1.412ms 75.10% 1.412ms 1.412ms 1.152us 4.88% 1.152us 1.152us 1 - aten::empty_strided 1.71% 32.171us 1.71% 32.171us 5.362us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.11% 58.461us 3.11% 58.461us 9.744us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.82% 34.274us 2.37% 44.512us 3.709us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.54% 10.238us 0.54% 10.238us 0.853us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.15% 40.521us 2.15% 40.521us 6.753us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.27% 5.140us 0.27% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.917us 1420.81% 341.917us 341.917us 1 + hf_kernels_rotary 9.02% 171.465us 99.73% 1.896ms 1.896ms 0.000us 0.00% 25.377us 25.377us 1 + _rotary_dba7d1e::apply_rotary 2.21% 42.031us 4.55% 86.422us 14.404us 16.192us 67.28% 16.192us 2.699us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.192us 67.28% 16.192us 2.699us 6 + aten::clone 1.19% 22.618us 84.01% 1.597ms 266.223us 0.000us 0.00% 9.185us 1.531us 6 + aten::copy_ 2.09% 39.723us 81.08% 1.542ms 256.918us 7.873us 32.72% 9.185us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.873us 32.72% 7.873us 1.312us 6 + Activity Buffer Request 76.21% 1.449ms 76.21% 1.449ms 1.449ms 1.312us 5.45% 1.312us 1.312us 1 + aten::empty_strided 1.75% 33.211us 1.75% 33.211us 5.535us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.78% 52.791us 2.78% 52.791us 8.798us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.69% 32.169us 2.16% 40.981us 3.415us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.46% 8.812us 0.46% 8.812us 0.734us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.33% 44.391us 2.33% 44.391us 7.399us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.060us 0.27% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.881ms -Self CUDA time total: 23.615us +Self CPU time total: 1.901ms +Self CUDA time total: 24.065us @@ -4041,23 +4050,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.939us 1384.60% 346.939us 346.939us 1 - hf_kernels_rotary 8.57% 160.653us 99.71% 1.870ms 1.870ms 0.000us 0.00% 26.369us 26.369us 1 - _rotary_dba7d1e::apply_rotary 2.32% 43.421us 4.67% 87.601us 14.600us 17.249us 68.84% 17.249us 2.875us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.249us 68.84% 17.249us 2.875us 6 - aten::clone 1.23% 23.032us 84.13% 1.577ms 262.912us 0.000us 0.00% 9.120us 1.520us 6 - aten::copy_ 1.94% 36.311us 81.17% 1.522ms 253.669us 7.808us 31.16% 9.120us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 31.16% 7.808us 1.301us 6 - Activity Buffer Request 76.42% 1.433ms 76.42% 1.433ms 1.433ms 1.312us 5.24% 1.312us 1.312us 1 - aten::empty_strided 1.73% 32.420us 1.73% 32.420us 5.403us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.81% 52.730us 2.81% 52.730us 8.788us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.83% 34.233us 2.34% 43.964us 3.664us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.52% 9.731us 0.52% 9.731us 0.811us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.36% 44.180us 2.36% 44.180us 7.363us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.29% 5.410us 0.29% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.260us 1398.72% 339.260us 339.260us 1 + hf_kernels_rotary 9.18% 174.993us 99.74% 1.901ms 1.901ms 0.000us 0.00% 25.567us 25.567us 1 + _rotary_dba7d1e::apply_rotary 2.30% 43.881us 4.51% 86.021us 14.337us 16.479us 67.94% 16.479us 2.746us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.479us 67.94% 16.479us 2.746us 6 + aten::clone 1.43% 27.180us 83.89% 1.599ms 266.516us 0.000us 0.00% 9.088us 1.515us 6 + aten::copy_ 2.04% 38.899us 80.70% 1.538ms 256.369us 7.776us 32.06% 9.088us 1.515us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 32.06% 7.776us 1.296us 6 + Activity Buffer Request 75.82% 1.445ms 75.82% 1.445ms 1.445ms 1.312us 5.41% 1.312us 1.312us 1 + aten::empty_strided 1.77% 33.702us 1.77% 33.702us 5.617us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.83% 54.013us 2.83% 54.013us 9.002us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.70% 32.344us 2.15% 41.003us 3.417us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.45% 8.659us 0.45% 8.659us 0.722us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.21% 42.140us 2.21% 42.140us 7.023us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.011us 0.26% 5.011us 5.011us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.875ms -Self CUDA time total: 25.057us +Self CPU time total: 1.906ms +Self CUDA time total: 24.255us @@ -4067,23 +4076,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.904us 1355.61% 347.904us 347.904us 1 - hf_kernels_rotary 7.92% 162.592us 99.76% 2.047ms 2.047ms 0.000us 0.00% 27.009us 27.009us 1 - _rotary_dba7d1e::apply_rotary 2.09% 42.932us 4.15% 85.134us 14.189us 17.951us 69.95% 17.951us 2.992us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.951us 69.95% 17.951us 2.992us 6 - aten::clone 1.22% 25.009us 85.61% 1.757ms 292.750us 0.000us 0.00% 9.058us 1.510us 6 - aten::copy_ 1.81% 37.091us 82.80% 1.699ms 283.112us 7.713us 30.05% 9.058us 1.510us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.713us 30.05% 7.713us 1.285us 6 - Activity Buffer Request 69.84% 1.433ms 69.84% 1.433ms 1.433ms 1.345us 5.24% 1.345us 1.345us 1 - aten::empty_strided 1.60% 32.820us 1.60% 32.820us 5.470us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.14% 228.627us 11.14% 228.627us 38.104us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.59% 32.701us 2.07% 42.551us 3.546us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.850us 0.48% 9.850us 0.821us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.06% 42.202us 2.06% 42.202us 7.034us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.861us 0.24% 4.861us 4.861us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 337.722us 1204.77% 337.722us 337.722us 1 + hf_kernels_rotary 8.26% 171.103us 99.73% 2.067ms 2.067ms 0.000us 0.00% 29.792us 29.792us 1 + _rotary_dba7d1e::apply_rotary 1.99% 41.331us 4.00% 82.932us 13.822us 17.632us 62.90% 17.632us 2.939us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.632us 62.90% 17.632us 2.939us 6 + aten::clone 1.32% 27.454us 85.43% 1.770ms 295.062us 0.000us 0.00% 12.160us 2.027us 6 + aten::copy_ 1.75% 36.211us 82.55% 1.711ms 285.086us 10.400us 37.10% 12.160us 2.027us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 37.10% 10.400us 1.733us 6 + Activity Buffer Request 68.85% 1.427ms 68.85% 1.427ms 1.427ms 1.760us 6.28% 1.760us 1.760us 1 + aten::empty_strided 1.56% 32.399us 1.56% 32.399us 5.400us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.95% 247.595us 11.95% 247.595us 41.266us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.60% 33.121us 2.04% 42.171us 3.514us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.44% 9.050us 0.44% 9.050us 0.754us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.01% 41.601us 2.01% 41.601us 6.933us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.640us 0.27% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.052ms -Self CUDA time total: 25.664us +Self CPU time total: 2.072ms +Self CUDA time total: 28.032us @@ -4093,23 +4102,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 356.192us 1425.17% 356.192us 356.192us 1 - hf_kernels_rotary 9.03% 181.778us 99.74% 2.009ms 2.009ms 0.000us 0.00% 26.306us 26.306us 1 - _rotary_dba7d1e::apply_rotary 2.18% 43.970us 4.25% 85.660us 14.277us 17.088us 68.37% 17.088us 2.848us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 68.37% 17.088us 2.848us 6 - aten::clone 1.16% 23.451us 84.31% 1.698ms 283.035us 0.000us 0.00% 9.218us 1.536us 6 - aten::copy_ 1.79% 36.151us 81.55% 1.643ms 273.753us 7.905us 31.63% 9.218us 1.536us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 31.63% 7.905us 1.318us 6 - Activity Buffer Request 70.14% 1.413ms 70.14% 1.413ms 1.413ms 1.313us 5.25% 1.313us 1.313us 1 - aten::empty_strided 1.60% 32.242us 1.60% 32.242us 5.374us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.61% 193.593us 9.61% 193.593us 32.266us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.67% 33.621us 2.15% 43.371us 3.614us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.750us 0.48% 9.750us 0.812us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.07% 41.690us 2.07% 41.690us 6.948us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 5.140us 0.26% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.845us 1402.50% 338.845us 338.845us 1 + hf_kernels_rotary 8.30% 171.257us 99.77% 2.058ms 2.058ms 0.000us 0.00% 25.440us 25.440us 1 + _rotary_dba7d1e::apply_rotary 2.01% 41.399us 4.04% 83.350us 13.892us 16.448us 68.08% 16.448us 2.741us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.448us 68.08% 16.448us 2.741us 6 + aten::clone 1.39% 28.751us 85.42% 1.762ms 293.702us 0.000us 0.00% 8.992us 1.499us 6 + aten::copy_ 1.85% 38.232us 82.52% 1.702ms 283.730us 7.712us 31.92% 8.992us 1.499us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 31.92% 7.712us 1.285us 6 + Activity Buffer Request 69.46% 1.433ms 69.46% 1.433ms 1.433ms 1.280us 5.30% 1.280us 1.280us 1 + aten::empty_strided 1.51% 31.081us 1.51% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.21% 231.173us 11.21% 231.173us 38.529us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.58% 32.650us 2.01% 41.461us 3.455us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.43% 8.811us 0.43% 8.811us 0.734us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.03% 41.951us 2.03% 41.951us 6.992us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.720us 0.23% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.014ms -Self CUDA time total: 24.993us +Self CPU time total: 2.063ms +Self CUDA time total: 24.160us @@ -4119,23 +4128,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.469us 1341.21% 345.469us 345.469us 1 - hf_kernels_rotary 8.14% 161.605us 99.74% 1.979ms 1.979ms 0.000us 0.00% 27.070us 27.070us 1 - _rotary_dba7d1e::apply_rotary 2.10% 41.690us 4.19% 83.112us 13.852us 17.982us 69.81% 17.982us 2.997us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.982us 69.81% 17.982us 2.997us 6 - aten::clone 1.15% 22.842us 85.12% 1.689ms 281.515us 0.000us 0.00% 9.088us 1.515us 6 - aten::copy_ 1.84% 36.466us 82.36% 1.634ms 272.405us 7.776us 30.19% 9.088us 1.515us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 30.19% 7.776us 1.296us 6 - Activity Buffer Request 71.40% 1.417ms 71.40% 1.417ms 1.417ms 1.312us 5.09% 1.312us 1.312us 1 - aten::empty_strided 1.60% 31.821us 1.60% 31.821us 5.303us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.12% 181.057us 9.12% 181.057us 30.176us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.80% 35.740us 2.29% 45.520us 3.793us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.49% 9.780us 0.49% 9.780us 0.815us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.09% 41.422us 2.09% 41.422us 6.904us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 5.151us 0.26% 5.151us 5.151us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.067us 1195.26% 335.067us 335.067us 1 + hf_kernels_rotary 20.13% 167.343us 99.42% 826.269us 826.269us 0.000us 0.00% 29.857us 29.857us 1 + _rotary_dba7d1e::apply_rotary 5.16% 42.850us 10.17% 84.521us 14.087us 17.537us 62.56% 17.537us 2.923us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.537us 62.56% 17.537us 2.923us 6 + aten::clone 2.67% 22.181us 64.36% 534.923us 89.154us 0.000us 0.00% 12.320us 2.053us 6 + aten::copy_ 4.47% 37.140us 57.76% 480.051us 80.008us 10.496us 37.44% 12.320us 2.053us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 37.44% 10.496us 1.749us 6 + Activity Buffer Request 26.04% 216.435us 26.04% 216.435us 216.435us 1.824us 6.51% 1.824us 1.824us 1 + aten::empty_strided 3.93% 32.691us 3.93% 32.691us 5.448us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 27.25% 226.476us 27.25% 226.476us 37.746us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.75% 31.143us 4.75% 39.482us 3.290us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.00% 8.339us 1.00% 8.339us 0.695us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.01% 41.671us 5.01% 41.671us 6.945us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.58% 4.851us 0.58% 4.851us 4.851us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.984ms -Self CUDA time total: 25.758us +Self CPU time total: 831.120us +Self CUDA time total: 28.033us @@ -4145,23 +4154,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 370.847us 1148.52% 370.847us 370.847us 1 - hf_kernels_rotary 8.48% 171.185us 99.77% 2.015ms 2.015ms 0.000us 0.00% 34.081us 34.081us 1 - _rotary_dba7d1e::apply_rotary 2.32% 46.763us 4.49% 90.723us 15.120us 21.793us 67.49% 21.793us 3.632us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.793us 67.49% 21.793us 3.632us 6 - aten::clone 1.25% 25.309us 84.59% 1.708ms 284.718us 0.000us 0.00% 12.288us 2.048us 6 - aten::copy_ 1.96% 39.631us 81.62% 1.648ms 274.723us 10.496us 32.51% 12.288us 2.048us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 32.51% 10.496us 1.749us 6 - Activity Buffer Request 70.18% 1.417ms 70.18% 1.417ms 1.417ms 1.792us 5.55% 1.792us 1.792us 1 - aten::empty_strided 1.72% 34.661us 1.72% 34.661us 5.777us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.48% 191.424us 9.48% 191.424us 31.904us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.73% 34.932us 2.22% 44.771us 3.731us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.49% 9.839us 0.49% 9.839us 0.820us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.18% 43.960us 2.18% 43.960us 7.327us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.601us 0.23% 4.601us 4.601us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 332.760us 825.36% 332.760us 332.760us 1 + hf_kernels_rotary 19.35% 167.193us 99.38% 858.880us 858.880us 0.000us 0.00% 43.165us 43.165us 1 + _rotary_dba7d1e::apply_rotary 4.67% 40.341us 9.39% 81.181us 13.530us 23.229us 57.62% 23.229us 3.871us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.229us 57.62% 23.229us 3.871us 6 + aten::clone 2.64% 22.801us 65.85% 569.083us 94.847us 0.000us 0.00% 19.936us 3.323us 6 + aten::copy_ 4.30% 37.172us 59.60% 515.092us 85.849us 17.088us 42.38% 19.936us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 42.38% 17.088us 2.848us 6 + Activity Buffer Request 29.73% 256.965us 29.73% 256.965us 256.965us 2.848us 7.06% 2.848us 2.848us 1 + aten::empty_strided 3.61% 31.190us 3.61% 31.190us 5.198us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.57% 220.955us 25.57% 220.955us 36.826us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.76% 32.492us 4.79% 41.423us 3.452us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.03% 8.931us 1.03% 8.931us 0.744us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.73% 40.840us 4.73% 40.840us 6.807us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.380us 0.62% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.020ms -Self CUDA time total: 32.289us +Self CPU time total: 864.260us +Self CUDA time total: 40.317us @@ -4171,23 +4180,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.533us 668.21% 345.533us 345.533us 1 - hf_kernels_rotary 8.13% 161.677us 99.76% 1.983ms 1.983ms 0.000us 0.00% 54.558us 54.558us 1 - _rotary_dba7d1e::apply_rotary 2.15% 42.810us 4.29% 85.240us 14.207us 34.782us 67.26% 34.782us 5.797us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.782us 67.26% 34.782us 5.797us 6 - aten::clone 1.16% 23.089us 85.02% 1.690ms 281.665us 0.000us 0.00% 19.776us 3.296us 6 - aten::copy_ 1.78% 35.482us 82.32% 1.636ms 272.722us 16.928us 32.74% 19.776us 3.296us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 32.74% 16.928us 2.821us 6 - Activity Buffer Request 71.53% 1.422ms 71.53% 1.422ms 1.422ms 2.848us 5.51% 2.848us 2.848us 1 - aten::empty_strided 1.54% 30.571us 1.54% 30.571us 5.095us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.00% 178.904us 9.00% 178.904us 29.817us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.84% 36.581us 2.32% 46.051us 3.838us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.470us 0.48% 9.470us 0.789us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.13% 42.430us 2.13% 42.430us 7.072us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.870us 0.24% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.785us 452.38% 353.785us 353.785us 1 + hf_kernels_rotary 18.85% 162.244us 99.38% 855.480us 855.480us 0.000us 0.00% 90.716us 90.716us 1 + aten::clone 2.60% 22.411us 65.93% 567.532us 94.589us 0.000us 0.00% 52.253us 8.709us 6 + aten::copy_ 4.73% 40.709us 58.92% 507.190us 84.532us 39.742us 50.82% 52.253us 8.709us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 39.742us 50.82% 39.742us 6.624us 6 + _rotary_dba7d1e::apply_rotary 4.83% 41.551us 9.72% 83.643us 13.941us 38.463us 49.18% 38.463us 6.410us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 38.463us 49.18% 38.463us 6.410us 6 + Activity Buffer Request 28.85% 248.356us 28.85% 248.356us 248.356us 12.511us 16.00% 12.511us 12.511us 1 + aten::empty_strided 4.41% 37.931us 4.41% 37.931us 6.322us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.34% 218.125us 25.34% 218.125us 36.354us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.86% 33.191us 4.89% 42.061us 3.505us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.03% 8.870us 1.03% 8.870us 0.739us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.89% 42.092us 4.89% 42.092us 7.015us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.350us 0.62% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.988ms -Self CUDA time total: 51.710us +Self CPU time total: 860.830us +Self CUDA time total: 78.205us @@ -4197,23 +4206,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.136us 1047.28% 338.136us 338.136us 1 - hf_kernels_rotary 19.11% 157.801us 99.43% 820.869us 820.869us 0.000us 0.00% 34.078us 34.078us 1 - _rotary_dba7d1e::apply_rotary 5.12% 42.269us 10.18% 84.080us 14.013us 21.792us 67.49% 21.792us 3.632us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.792us 67.49% 21.792us 3.632us 6 - aten::clone 2.56% 21.133us 65.13% 537.684us 89.614us 0.000us 0.00% 12.286us 2.048us 6 - aten::copy_ 4.56% 37.650us 58.77% 485.172us 80.862us 10.495us 32.51% 12.286us 2.048us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 32.51% 10.495us 1.749us 6 - Activity Buffer Request 32.51% 268.347us 32.51% 268.347us 268.347us 1.791us 5.55% 1.791us 1.791us 1 - aten::empty_strided 3.80% 31.379us 3.80% 31.379us 5.230us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.70% 179.175us 21.70% 179.175us 29.862us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.93% 32.405us 5.00% 41.304us 3.442us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.08% 8.899us 1.08% 8.899us 0.742us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.06% 41.811us 5.06% 41.811us 6.969us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.680us 0.57% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 337.342us 833.35% 337.342us 337.342us 1 + hf_kernels_rotary 8.49% 173.955us 99.77% 2.043ms 2.043ms 0.000us 0.00% 43.328us 43.328us 1 + _rotary_dba7d1e::apply_rotary 2.03% 41.590us 4.02% 82.231us 13.705us 23.487us 58.02% 23.487us 3.915us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.487us 58.02% 23.487us 3.915us 6 + aten::clone 1.34% 27.379us 85.23% 1.745ms 290.890us 0.000us 0.00% 19.841us 3.307us 6 + aten::copy_ 1.78% 36.424us 82.41% 1.688ms 281.287us 16.993us 41.98% 19.841us 3.307us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.993us 41.98% 16.993us 2.832us 6 + Activity Buffer Request 70.11% 1.436ms 70.11% 1.436ms 1.436ms 2.848us 7.04% 2.848us 2.848us 1 + aten::empty_strided 1.48% 30.241us 1.48% 30.241us 5.040us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.52% 215.434us 10.52% 215.434us 35.906us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.62% 33.159us 2.03% 41.651us 3.471us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.41% 8.492us 0.41% 8.492us 0.708us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.98% 40.641us 1.98% 40.641us 6.773us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.700us 0.23% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 825.549us -Self CUDA time total: 32.287us +Self CPU time total: 2.048ms +Self CUDA time total: 40.480us @@ -4223,23 +4232,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.832us 672.66% 347.832us 347.832us 1 - hf_kernels_rotary 18.98% 156.996us 99.42% 822.501us 822.501us 0.000us 0.00% 54.558us 54.558us 1 - _rotary_dba7d1e::apply_rotary 5.15% 42.621us 10.22% 84.512us 14.085us 34.783us 67.27% 34.783us 5.797us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.783us 67.27% 34.783us 5.797us 6 - aten::clone 2.65% 21.930us 64.92% 537.102us 89.517us 0.000us 0.00% 19.775us 3.296us 6 - aten::copy_ 4.53% 37.450us 58.33% 482.542us 80.424us 16.927us 32.73% 19.775us 3.296us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.927us 32.73% 16.927us 2.821us 6 - Activity Buffer Request 32.06% 265.247us 32.06% 265.247us 265.247us 2.848us 5.51% 2.848us 2.848us 1 - aten::empty_strided 3.94% 32.630us 3.94% 32.630us 5.438us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.74% 179.845us 21.74% 179.845us 29.974us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.14% 34.239us 5.31% 43.891us 3.658us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.17% 9.652us 1.17% 9.652us 0.804us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.06% 41.891us 5.06% 41.891us 6.982us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.770us 0.58% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 367.387us 482.60% 367.387us 367.387us 1 + hf_kernels_rotary 8.43% 173.690us 99.77% 2.056ms 2.056ms 0.000us 0.00% 86.687us 86.687us 1 + aten::clone 1.25% 25.689us 84.25% 1.736ms 289.338us 0.000us 0.00% 47.648us 7.941us 6 + aten::copy_ 1.77% 36.381us 81.42% 1.678ms 279.615us 37.088us 48.72% 47.648us 7.941us 6 + _rotary_dba7d1e::apply_rotary 2.83% 58.403us 5.00% 103.123us 17.187us 39.039us 51.28% 39.039us 6.506us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.039us 51.28% 39.039us 6.506us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.088us 48.72% 37.088us 6.181us 6 + Activity Buffer Request 69.27% 1.427ms 69.27% 1.427ms 1.427ms 10.560us 13.87% 10.560us 10.560us 1 + aten::empty_strided 1.58% 32.653us 1.58% 32.653us 5.442us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.38% 213.985us 10.38% 213.985us 35.664us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.64% 33.864us 2.09% 42.973us 3.581us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.44% 9.109us 0.44% 9.109us 0.759us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.17% 44.720us 2.17% 44.720us 7.453us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.800us 0.23% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 827.271us -Self CUDA time total: 51.710us +Self CPU time total: 2.061ms +Self CUDA time total: 76.127us @@ -4249,23 +4258,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.413us 323.34% 352.413us 352.413us 1 - hf_kernels_rotary 18.38% 152.793us 99.44% 826.801us 826.801us 0.000us 0.00% 127.423us 127.423us 1 - aten::clone 2.64% 21.959us 64.91% 539.754us 89.959us 0.000us 0.00% 69.984us 11.664us 6 - aten::copy_ 4.48% 37.251us 58.50% 486.434us 81.072us 51.552us 47.30% 69.984us 11.664us 6 - _rotary_dba7d1e::apply_rotary 5.35% 44.522us 10.55% 87.704us 14.617us 57.439us 52.70% 57.439us 9.573us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 57.439us 52.70% 57.439us 9.573us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.552us 47.30% 51.552us 8.592us 6 - Activity Buffer Request 32.52% 270.437us 32.52% 270.437us 270.437us 18.432us 16.91% 18.432us 18.432us 1 - aten::empty_strided 3.77% 31.361us 3.77% 31.361us 5.227us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.50% 178.746us 21.50% 178.746us 29.791us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.45% 36.960us 5.60% 46.550us 3.879us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.15% 9.590us 1.15% 9.590us 0.799us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.19% 43.182us 5.19% 43.182us 7.197us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.56% 4.690us 0.56% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.717us 252.54% 350.717us 350.717us 1 + hf_kernels_rotary 8.20% 168.514us 99.75% 2.049ms 2.049ms 0.000us 0.00% 162.494us 162.494us 1 + aten::clone 1.38% 28.280us 85.37% 1.754ms 292.317us 0.000us 0.00% 102.494us 17.082us 6 + aten::copy_ 1.89% 38.810us 82.42% 1.693ms 282.225us 78.878us 56.80% 102.494us 17.082us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.878us 56.80% 78.878us 13.146us 6 + _rotary_dba7d1e::apply_rotary 2.03% 41.642us 4.17% 85.643us 14.274us 60.000us 43.20% 60.000us 10.000us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 60.000us 43.20% 60.000us 10.000us 6 + Activity Buffer Request 70.32% 1.445ms 70.32% 1.445ms 1.445ms 23.616us 17.00% 23.616us 23.616us 1 + aten::empty_strided 1.57% 32.271us 1.57% 32.271us 5.379us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.22% 209.905us 10.22% 209.905us 34.984us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.59% 32.591us 2.01% 41.291us 3.441us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.42% 8.700us 0.42% 8.700us 0.725us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.14% 44.001us 2.14% 44.001us 7.333us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.25% 5.120us 0.25% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 831.491us -Self CUDA time total: 108.991us +Self CPU time total: 2.054ms +Self CUDA time total: 138.878us @@ -4275,23 +4284,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 354.429us 196.77% 354.429us 354.429us 1 - hf_kernels_rotary 18.96% 156.272us 99.48% 819.980us 819.980us 0.000us 0.00% 203.900us 203.900us 1 - aten::clone 2.73% 22.479us 64.84% 534.473us 89.079us 0.000us 0.00% 102.557us 17.093us 6 - aten::copy_ 4.31% 35.551us 58.35% 480.933us 80.156us 78.782us 43.74% 102.557us 17.093us 6 - _rotary_dba7d1e::apply_rotary 5.14% 42.393us 10.35% 85.274us 14.212us 101.343us 56.26% 101.343us 16.890us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 101.343us 56.26% 101.343us 16.890us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.782us 43.74% 78.782us 13.130us 6 - Activity Buffer Request 32.52% 268.027us 32.52% 268.027us 268.027us 23.775us 13.20% 23.775us 23.775us 1 - aten::empty_strided 3.77% 31.061us 3.77% 31.061us 5.177us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.52% 177.355us 21.52% 177.355us 29.559us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.12% 33.982us 5.33% 43.961us 3.663us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.21% 9.979us 1.21% 9.979us 0.832us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.20% 42.881us 5.20% 42.881us 7.147us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.52% 4.300us 0.52% 4.300us 4.300us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 7.33% 173.322us 87.05% 2.058ms 2.058ms 0.000us 0.00% 773.117us 773.117us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 713.886us 101.15% 713.886us 713.886us 1 + aten::clone 1.12% 26.510us 74.03% 1.750ms 291.637us 0.000us 0.00% 574.975us 95.829us 6 + aten::copy_ 1.66% 39.271us 70.83% 1.674ms 279.060us 507.647us 71.93% 574.975us 95.829us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 507.647us 71.93% 507.647us 84.608us 6 + _rotary_dba7d1e::apply_rotary 1.93% 45.683us 3.90% 92.264us 15.377us 198.142us 28.07% 198.142us 33.024us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 198.142us 28.07% 198.142us 33.024us 6 + Activity Buffer Request 60.04% 1.419ms 60.04% 1.419ms 1.419ms 67.328us 9.54% 67.328us 67.328us 1 + aten::empty_strided 2.07% 48.953us 2.07% 48.953us 8.159us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.13% 215.804us 9.13% 215.804us 35.967us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.42% 33.620us 1.79% 42.281us 3.523us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.37% 8.661us 0.37% 8.661us 0.722us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.97% 46.581us 1.97% 46.581us 7.764us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 12.95% 306.087us 12.95% 306.087us 306.087us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 824.280us -Self CUDA time total: 180.125us +Self CPU time total: 2.364ms +Self CUDA time total: 705.789us @@ -4301,23 +4310,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.587us 1293.50% 338.587us 338.587us 1 - hf_kernels_rotary 19.34% 157.366us 99.42% 808.960us 808.960us 0.000us 0.00% 27.296us 27.296us 1 - _rotary_dba7d1e::apply_rotary 5.26% 42.761us 10.55% 85.842us 14.307us 19.392us 74.08% 19.392us 3.232us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.392us 74.08% 19.392us 3.232us 6 - aten::clone 2.60% 21.121us 64.41% 524.052us 87.342us 0.000us 0.00% 7.904us 1.317us 6 - aten::copy_ 4.60% 37.442us 58.06% 472.441us 78.740us 6.784us 25.92% 7.904us 1.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 25.92% 6.784us 1.131us 6 - Activity Buffer Request 31.61% 257.196us 31.61% 257.196us 257.196us 1.120us 4.28% 1.120us 1.120us 1 - aten::empty_strided 3.75% 30.490us 3.75% 30.490us 5.082us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.85% 177.803us 21.85% 177.803us 29.634us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.95% 32.140us 5.12% 41.700us 3.475us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.17% 9.560us 1.17% 9.560us 0.797us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.29% 43.081us 5.29% 43.081us 7.180us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.711us 0.58% 4.711us 4.711us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.724us 1295.48% 345.724us 345.724us 1 + hf_kernels_rotary 8.56% 176.117us 99.77% 2.053ms 2.053ms 0.000us 0.00% 27.999us 27.999us 1 + _rotary_dba7d1e::apply_rotary 2.01% 41.279us 4.04% 83.070us 13.845us 18.817us 70.51% 18.817us 3.136us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.817us 70.51% 18.817us 3.136us 6 + aten::clone 1.59% 32.672us 85.14% 1.752ms 292.043us 0.000us 0.00% 9.182us 1.530us 6 + aten::copy_ 1.83% 37.700us 81.90% 1.685ms 280.910us 7.870us 29.49% 9.182us 1.530us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.870us 29.49% 7.870us 1.312us 6 + Activity Buffer Request 70.01% 1.441ms 70.01% 1.441ms 1.441ms 1.312us 4.92% 1.312us 1.312us 1 + aten::empty_strided 1.66% 34.130us 1.66% 34.130us 5.688us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.06% 206.965us 10.06% 206.965us 34.494us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.59% 32.681us 2.04% 41.911us 3.493us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.45% 9.230us 0.45% 9.230us 0.769us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.03% 41.791us 2.03% 41.791us 6.965us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.640us 0.23% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 813.671us -Self CUDA time total: 26.176us +Self CPU time total: 2.058ms +Self CUDA time total: 26.687us @@ -4327,23 +4336,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.862us 1278.50% 349.862us 349.862us 1 - hf_kernels_rotary 19.32% 156.134us 99.42% 803.460us 803.460us 0.000us 0.00% 28.709us 28.709us 1 - _rotary_dba7d1e::apply_rotary 5.33% 43.099us 10.84% 87.643us 14.607us 19.428us 71.00% 19.428us 3.238us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.428us 71.00% 19.428us 3.238us 6 - aten::clone 2.80% 22.600us 63.71% 514.893us 85.816us 0.000us 0.00% 9.281us 1.547us 6 - aten::copy_ 4.89% 39.481us 56.99% 460.582us 76.764us 7.937us 29.00% 9.281us 1.547us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.937us 29.00% 7.937us 1.323us 6 - Activity Buffer Request 27.85% 225.076us 27.85% 225.076us 225.076us 1.344us 4.91% 1.344us 1.344us 1 - aten::empty_strided 3.92% 31.711us 3.92% 31.711us 5.285us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 24.26% 196.025us 24.26% 196.025us 32.671us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.38% 35.400us 5.54% 44.790us 3.732us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.16% 9.390us 1.16% 9.390us 0.782us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.51% 44.544us 5.51% 44.544us 7.424us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.720us 0.58% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 334.455us 1253.30% 334.455us 334.455us 1 + hf_kernels_rotary 18.38% 152.071us 99.44% 822.719us 822.719us 0.000us 0.00% 27.966us 27.966us 1 + _rotary_dba7d1e::apply_rotary 5.29% 43.741us 10.38% 85.902us 14.317us 18.975us 71.10% 18.975us 3.163us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.975us 71.10% 18.975us 3.163us 6 + aten::clone 2.47% 20.399us 65.75% 544.023us 90.670us 0.000us 0.00% 8.991us 1.498us 6 + aten::copy_ 4.79% 39.600us 59.70% 493.952us 82.325us 7.711us 28.90% 8.991us 1.498us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.711us 28.90% 7.711us 1.285us 6 + Activity Buffer Request 30.27% 250.456us 30.27% 250.456us 250.456us 1.280us 4.80% 1.280us 1.280us 1 + aten::empty_strided 3.59% 29.672us 3.59% 29.672us 4.945us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.64% 203.896us 24.64% 203.896us 33.983us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.84% 31.802us 4.92% 40.723us 3.394us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 8.921us 1.08% 8.921us 0.743us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.10% 42.161us 5.10% 42.161us 7.027us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.56% 4.640us 0.56% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 808.180us -Self CUDA time total: 27.365us +Self CPU time total: 827.359us +Self CUDA time total: 26.686us @@ -4353,23 +4362,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.981us 1235.85% 349.981us 349.981us 1 - hf_kernels_rotary 8.03% 161.215us 99.76% 2.003ms 2.003ms 0.000us 0.00% 29.663us 29.663us 1 - _rotary_dba7d1e::apply_rotary 2.11% 42.422us 4.23% 84.982us 14.164us 20.544us 72.54% 20.544us 3.424us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.544us 72.54% 20.544us 3.424us 6 - aten::clone 1.12% 22.572us 85.29% 1.712ms 285.349us 0.000us 0.00% 9.119us 1.520us 6 - aten::copy_ 1.91% 38.260us 82.54% 1.657ms 276.143us 7.775us 27.46% 9.119us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 27.46% 7.775us 1.296us 6 - Activity Buffer Request 71.67% 1.439ms 71.67% 1.439ms 1.439ms 1.344us 4.75% 1.344us 1.344us 1 - aten::empty_strided 1.63% 32.660us 1.63% 32.660us 5.443us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.96% 179.936us 8.96% 179.936us 29.989us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.74% 34.910us 2.20% 44.250us 3.688us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.47% 9.340us 0.47% 9.340us 0.778us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.12% 42.560us 2.12% 42.560us 7.093us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.741us 0.24% 4.741us 4.741us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.033us 1090.71% 335.033us 335.033us 1 + hf_kernels_rotary 18.88% 152.276us 99.36% 801.289us 801.289us 0.000us 0.00% 32.445us 32.445us 1 + _rotary_dba7d1e::apply_rotary 5.08% 40.990us 10.38% 83.672us 13.945us 20.127us 65.52% 20.127us 3.354us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.127us 65.52% 20.127us 3.354us 6 + aten::clone 2.64% 21.299us 65.14% 525.331us 87.555us 0.000us 0.00% 12.318us 2.053us 6 + aten::copy_ 5.22% 42.109us 58.69% 473.291us 78.882us 10.590us 34.48% 12.318us 2.053us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.590us 34.48% 10.590us 1.765us 6 + Activity Buffer Request 26.39% 212.815us 26.39% 212.815us 212.815us 1.728us 5.63% 1.728us 1.728us 1 + aten::empty_strided 3.81% 30.741us 3.81% 30.741us 5.123us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 27.08% 218.367us 27.08% 218.367us 36.394us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.88% 31.271us 4.96% 40.010us 3.334us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 8.739us 1.08% 8.739us 0.728us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.29% 42.682us 5.29% 42.682us 7.114us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.180us 0.64% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.007ms -Self CUDA time total: 28.319us +Self CPU time total: 806.469us +Self CUDA time total: 30.717us @@ -4379,23 +4388,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.238us 971.27% 346.238us 346.238us 1 - hf_kernels_rotary 8.04% 160.124us 99.76% 1.988ms 1.988ms 0.000us 0.00% 37.440us 37.440us 1 - _rotary_dba7d1e::apply_rotary 2.20% 43.921us 4.24% 84.493us 14.082us 25.216us 70.74% 25.216us 4.203us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.216us 70.74% 25.216us 4.203us 6 - aten::clone 1.14% 22.762us 85.30% 1.700ms 283.325us 0.000us 0.00% 12.224us 2.037us 6 - aten::copy_ 1.84% 36.620us 82.53% 1.645ms 274.105us 10.432us 29.26% 12.224us 2.037us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 29.26% 10.432us 1.739us 6 - Activity Buffer Request 71.70% 1.429ms 71.70% 1.429ms 1.429ms 1.792us 5.03% 1.792us 1.792us 1 - aten::empty_strided 1.63% 32.561us 1.63% 32.561us 5.427us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.99% 179.114us 8.99% 179.114us 29.852us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.72% 34.250us 2.18% 43.390us 3.616us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.46% 9.140us 0.46% 9.140us 0.762us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.04% 40.572us 2.04% 40.572us 6.762us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.860us 0.24% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 336.501us 787.14% 336.501us 336.501us 1 + hf_kernels_rotary 18.58% 152.224us 99.36% 814.019us 814.019us 0.000us 0.00% 45.662us 45.662us 1 + _rotary_dba7d1e::apply_rotary 5.02% 41.151us 10.13% 82.992us 13.832us 25.695us 60.11% 25.695us 4.283us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.695us 60.11% 25.695us 4.283us 6 + aten::clone 2.59% 21.259us 65.61% 537.562us 89.594us 0.000us 0.00% 19.967us 3.328us 6 + aten::copy_ 4.69% 38.391us 59.23% 485.282us 80.880us 17.055us 39.89% 19.967us 3.328us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.055us 39.89% 17.055us 2.842us 6 + Activity Buffer Request 29.84% 244.476us 29.84% 244.476us 244.476us 2.912us 6.81% 2.912us 2.912us 1 + aten::empty_strided 3.79% 31.021us 3.79% 31.021us 5.170us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.71% 202.415us 24.71% 202.415us 33.736us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.95% 32.360us 5.03% 41.241us 3.437us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.08% 8.881us 1.08% 8.881us 0.740us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.11% 41.841us 5.11% 41.841us 6.973us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.261us 0.64% 5.261us 5.261us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.993ms -Self CUDA time total: 35.648us +Self CPU time total: 819.280us +Self CUDA time total: 42.750us @@ -4405,23 +4414,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.675us 1229.10% 347.675us 347.675us 1 - hf_kernels_rotary 8.06% 160.274us 99.76% 1.984ms 1.984ms 0.000us 0.00% 29.631us 29.631us 1 - _rotary_dba7d1e::apply_rotary 2.18% 43.331us 4.28% 85.164us 14.194us 20.511us 72.51% 20.511us 3.418us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.511us 72.51% 20.511us 3.418us 6 - aten::clone 1.13% 22.531us 85.26% 1.696ms 282.610us 0.000us 0.00% 9.120us 1.520us 6 - aten::copy_ 1.97% 39.252us 82.52% 1.641ms 273.528us 7.776us 27.49% 9.120us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 27.49% 7.776us 1.296us 6 - Activity Buffer Request 71.58% 1.424ms 71.58% 1.424ms 1.424ms 1.344us 4.75% 1.344us 1.344us 1 - aten::empty_strided 1.61% 31.959us 1.61% 31.959us 5.326us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.97% 178.354us 8.97% 178.354us 29.726us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.68% 33.430us 2.16% 42.920us 3.577us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.490us 0.48% 9.490us 0.791us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.10% 41.833us 2.10% 41.833us 6.972us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.801us 0.24% 4.801us 4.801us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 330.306us 1080.84% 330.306us 330.306us 1 + hf_kernels_rotary 18.42% 149.681us 99.42% 807.979us 807.979us 0.000us 0.00% 32.321us 32.321us 1 + _rotary_dba7d1e::apply_rotary 5.10% 41.443us 10.19% 82.853us 13.809us 20.128us 65.86% 20.128us 3.355us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.128us 65.86% 20.128us 3.355us 6 + aten::clone 2.61% 21.203us 65.84% 535.084us 89.181us 0.000us 0.00% 12.193us 2.032us 6 + aten::copy_ 4.51% 36.639us 59.42% 482.940us 80.490us 10.432us 34.14% 12.193us 2.032us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 34.14% 10.432us 1.739us 6 + Activity Buffer Request 29.60% 240.586us 29.60% 240.586us 240.586us 1.761us 5.76% 1.761us 1.761us 1 + aten::empty_strided 3.81% 30.941us 3.81% 30.941us 5.157us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 25.31% 205.715us 25.31% 205.715us 34.286us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.91% 31.761us 4.97% 40.361us 3.363us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.06% 8.600us 1.06% 8.600us 0.717us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.10% 41.410us 5.10% 41.410us 6.902us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.58% 4.711us 0.58% 4.711us 4.711us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.989ms -Self CUDA time total: 28.287us +Self CPU time total: 812.690us +Self CUDA time total: 30.560us @@ -4431,23 +4440,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.434us 959.52% 341.434us 341.434us 1 - hf_kernels_rotary 20.68% 156.375us 99.37% 751.248us 751.248us 0.000us 0.00% 37.312us 37.312us 1 - _rotary_dba7d1e::apply_rotary 5.66% 42.780us 11.14% 84.232us 14.039us 25.184us 70.77% 25.184us 4.197us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.184us 70.77% 25.184us 4.197us 6 - aten::clone 3.01% 22.779us 61.92% 468.081us 78.014us 0.000us 0.00% 12.128us 2.021us 6 - aten::copy_ 4.78% 36.161us 54.65% 413.150us 68.858us 10.400us 29.23% 12.128us 2.021us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 29.23% 10.400us 1.733us 6 - Activity Buffer Request 26.22% 198.225us 26.22% 198.225us 198.225us 1.728us 4.86% 1.728us 1.728us 1 - aten::empty_strided 4.25% 32.152us 4.25% 32.152us 5.359us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 23.65% 178.764us 23.65% 178.764us 29.794us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.40% 33.290us 5.63% 42.560us 3.547us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.23% 9.270us 1.23% 9.270us 0.773us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.48% 41.452us 5.48% 41.452us 6.909us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.63% 4.741us 0.63% 4.741us 4.741us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.800us 797.80% 339.800us 339.800us 1 + hf_kernels_rotary 14.79% 151.874us 99.55% 1.022ms 1.022ms 0.000us 0.00% 45.440us 45.440us 1 + _rotary_dba7d1e::apply_rotary 4.15% 42.610us 8.29% 85.131us 14.188us 25.536us 59.95% 25.536us 4.256us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.536us 59.95% 25.536us 4.256us 6 + aten::clone 2.08% 21.390us 72.46% 743.968us 123.995us 0.000us 0.00% 19.904us 3.317us 6 + aten::copy_ 3.85% 39.501us 67.34% 691.417us 115.236us 17.056us 40.05% 19.904us 3.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 40.05% 17.056us 2.843us 6 + Activity Buffer Request 43.74% 449.121us 43.74% 449.121us 449.121us 2.848us 6.69% 2.848us 2.848us 1 + aten::empty_strided 3.03% 31.161us 3.03% 31.161us 5.193us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 19.75% 202.795us 19.75% 202.795us 33.799us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.15% 32.321us 4.01% 41.121us 3.427us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.86% 8.800us 0.86% 8.800us 0.733us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.14% 42.521us 4.14% 42.521us 7.087us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.45% 4.640us 0.45% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 755.989us -Self CUDA time total: 35.584us +Self CPU time total: 1.027ms +Self CUDA time total: 42.592us @@ -4457,23 +4466,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.886us 617.06% 349.886us 349.886us 1 - hf_kernels_rotary 15.93% 158.238us 99.46% 988.285us 988.285us 0.000us 0.00% 59.582us 59.582us 1 - _rotary_dba7d1e::apply_rotary 4.43% 44.009us 8.77% 87.171us 14.528us 39.742us 70.09% 39.742us 6.624us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.742us 70.09% 39.742us 6.624us 6 - aten::clone 2.20% 21.907us 70.33% 698.845us 116.474us 0.000us 0.00% 19.840us 3.307us 6 - aten::copy_ 3.76% 37.392us 65.02% 646.067us 107.678us 16.960us 29.91% 19.840us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 29.91% 16.960us 2.827us 6 - Activity Buffer Request 43.30% 430.221us 43.30% 430.221us 430.221us 2.880us 5.08% 2.880us 2.880us 1 - aten::empty_strided 3.11% 30.871us 3.11% 30.871us 5.145us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.96% 178.454us 17.96% 178.454us 29.742us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.43% 34.051us 4.43% 44.031us 3.669us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.00% 9.980us 1.00% 9.980us 0.832us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.34% 43.162us 4.34% 43.162us 7.194us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.54% 5.320us 0.54% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.355us 384.24% 339.355us 339.355us 1 + hf_kernels_rotary 17.67% 151.144us 99.35% 849.740us 849.740us 0.000us 0.00% 103.359us 103.359us 1 + aten::clone 2.51% 21.431us 67.11% 573.953us 95.659us 0.000us 0.00% 62.527us 10.421us 6 + aten::copy_ 4.55% 38.879us 60.94% 521.222us 86.870us 47.487us 53.77% 62.527us 10.421us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 47.487us 53.77% 47.487us 7.915us 6 + _rotary_dba7d1e::apply_rotary 5.04% 43.111us 9.90% 84.683us 14.114us 40.832us 46.23% 40.832us 6.805us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 40.832us 46.23% 40.832us 6.805us 6 + Activity Buffer Request 33.08% 282.937us 33.08% 282.937us 282.937us 15.040us 17.03% 15.040us 15.040us 1 + aten::empty_strided 3.66% 31.300us 3.66% 31.300us 5.217us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.31% 199.406us 23.31% 199.406us 33.234us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.68% 31.469us 4.67% 39.960us 3.330us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.491us 0.99% 8.491us 0.708us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.86% 41.572us 4.86% 41.572us 6.929us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 5.540us 0.65% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 993.605us -Self CUDA time total: 56.702us +Self CPU time total: 855.280us +Self CUDA time total: 88.319us @@ -4483,23 +4492,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.574us 297.38% 352.574us 352.574us 1 - hf_kernels_rotary 18.56% 157.003us 99.43% 841.041us 841.041us 0.000us 0.00% 135.680us 135.680us 1 - aten::clone 2.59% 21.881us 65.75% 556.174us 92.696us 0.000us 0.00% 69.984us 11.664us 6 - aten::copy_ 4.37% 36.992us 59.34% 501.912us 83.652us 52.864us 44.59% 69.984us 11.664us 6 - _rotary_dba7d1e::apply_rotary 5.11% 43.221us 10.14% 85.754us 14.292us 65.696us 55.41% 65.696us 10.949us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 65.696us 55.41% 65.696us 10.949us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.864us 44.59% 52.864us 8.811us 6 - Activity Buffer Request 33.65% 284.597us 33.65% 284.597us 284.597us 17.120us 14.44% 17.120us 17.120us 1 - aten::empty_strided 3.83% 32.381us 3.83% 32.381us 5.397us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.32% 180.323us 21.32% 180.323us 30.054us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.89% 32.880us 4.98% 42.110us 3.509us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.09% 9.230us 1.09% 9.230us 0.769us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.03% 42.533us 5.03% 42.533us 7.089us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.810us 0.57% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 340.473us 232.72% 340.473us 340.473us 1 + hf_kernels_rotary 20.27% 170.703us 99.41% 837.069us 837.069us 0.000us 0.00% 170.267us 170.267us 1 + aten::clone 2.43% 20.451us 64.32% 541.612us 90.269us 0.000us 0.00% 106.876us 17.813us 6 + aten::copy_ 4.52% 38.100us 58.32% 491.079us 81.846us 82.909us 56.67% 106.876us 17.813us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 82.909us 56.67% 82.909us 13.818us 6 + _rotary_dba7d1e::apply_rotary 5.01% 42.193us 10.04% 84.563us 14.094us 63.391us 43.33% 63.391us 10.565us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.391us 43.33% 63.391us 10.565us 6 + Activity Buffer Request 30.43% 256.245us 30.43% 256.245us 256.245us 23.967us 16.38% 23.967us 23.967us 1 + aten::empty_strided 3.57% 30.082us 3.57% 30.082us 5.014us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.36% 196.734us 23.36% 196.734us 32.789us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.79% 31.880us 4.77% 40.191us 3.349us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.99% 8.311us 0.99% 8.311us 0.693us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.03% 42.370us 5.03% 42.370us 7.062us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.59% 4.950us 0.59% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 845.851us -Self CUDA time total: 118.560us +Self CPU time total: 842.019us +Self CUDA time total: 146.300us @@ -4509,23 +4518,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.982us 603.45% 341.982us 341.982us 1 - hf_kernels_rotary 18.98% 155.712us 99.43% 815.710us 815.710us 0.000us 0.00% 59.487us 59.487us 1 - _rotary_dba7d1e::apply_rotary 5.25% 43.112us 10.37% 85.045us 14.174us 39.839us 70.30% 39.839us 6.640us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.839us 70.30% 39.839us 6.640us 6 - aten::clone 2.51% 20.600us 64.82% 531.763us 88.627us 0.000us 0.00% 19.648us 3.275us 6 - aten::copy_ 4.52% 37.100us 58.54% 480.262us 80.044us 16.832us 29.70% 19.648us 3.275us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 29.70% 16.832us 2.805us 6 - Activity Buffer Request 32.45% 266.237us 32.45% 266.237us 266.237us 2.816us 4.97% 2.816us 2.816us 1 - aten::empty_strided 3.77% 30.901us 3.77% 30.901us 5.150us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.57% 176.925us 21.57% 176.925us 29.488us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.05% 33.240us 5.26% 43.190us 3.599us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.21% 9.950us 1.21% 9.950us 0.829us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.11% 41.933us 5.11% 41.933us 6.989us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.700us 0.57% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 372.221us 493.09% 372.221us 372.221us 1 + hf_kernels_rotary 8.48% 174.314us 99.77% 2.050ms 2.050ms 0.000us 0.00% 82.399us 82.399us 1 + _rotary_dba7d1e::apply_rotary 1.96% 40.332us 4.07% 83.672us 13.945us 41.887us 55.49% 41.887us 6.981us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 41.887us 55.49% 41.887us 6.981us 6 + aten::clone 1.40% 28.690us 85.15% 1.749ms 291.567us 0.000us 0.00% 40.512us 6.752us 6 + aten::copy_ 2.00% 41.101us 81.21% 1.669ms 278.085us 33.600us 44.51% 40.512us 6.752us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 33.600us 44.51% 33.600us 5.600us 6 + Activity Buffer Request 69.57% 1.429ms 69.57% 1.429ms 1.429ms 6.912us 9.16% 6.912us 6.912us 1 + aten::empty_strided 2.54% 52.203us 2.54% 52.203us 8.701us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.64% 198.094us 9.64% 198.094us 33.016us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.63% 33.409us 2.06% 42.400us 3.533us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.44% 8.991us 0.44% 8.991us 0.749us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.11% 43.340us 2.11% 43.340us 7.223us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.681us 0.23% 4.681us 4.681us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 820.410us -Self CUDA time total: 56.671us +Self CPU time total: 2.054ms +Self CUDA time total: 75.487us @@ -4535,23 +4544,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 388.726us 325.86% 388.726us 388.726us 1 - hf_kernels_rotary 19.76% 169.936us 99.45% 855.401us 855.401us 0.000us 0.00% 136.923us 136.923us 1 - aten::clone 2.64% 22.710us 63.15% 543.123us 90.521us 0.000us 0.00% 70.877us 11.813us 6 - aten::copy_ 4.46% 38.370us 56.50% 485.931us 80.988us 53.246us 44.64% 70.877us 11.813us 6 - _rotary_dba7d1e::apply_rotary 5.64% 48.490us 10.91% 93.801us 15.634us 66.046us 55.36% 66.046us 11.008us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 66.046us 55.36% 66.046us 11.008us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.246us 44.64% 53.246us 8.874us 6 - Activity Buffer Request 30.83% 265.147us 30.83% 265.147us 265.147us 17.631us 14.78% 17.631us 17.631us 1 - aten::empty_strided 4.01% 34.482us 4.01% 34.482us 5.747us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.21% 182.414us 21.21% 182.414us 30.402us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.39% 37.781us 5.64% 48.541us 4.045us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.25% 10.760us 1.25% 10.760us 0.897us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.27% 45.311us 5.27% 45.311us 7.552us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.55% 4.700us 0.55% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.925us 241.86% 352.925us 352.925us 1 + hf_kernels_rotary 8.45% 172.475us 99.74% 2.036ms 2.036ms 0.000us 0.00% 169.664us 169.664us 1 + aten::clone 1.31% 26.731us 85.08% 1.737ms 289.500us 0.000us 0.00% 105.664us 17.611us 6 + aten::copy_ 1.93% 39.410us 82.15% 1.677ms 279.535us 81.920us 56.14% 105.664us 17.611us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.920us 56.14% 81.920us 13.653us 6 + _rotary_dba7d1e::apply_rotary 2.10% 42.891us 4.20% 85.681us 14.280us 64.000us 43.86% 64.000us 10.667us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.000us 43.86% 64.000us 10.667us 6 + Activity Buffer Request 70.60% 1.441ms 70.60% 1.441ms 1.441ms 23.744us 16.27% 23.744us 23.744us 1 + aten::empty_strided 1.62% 33.061us 1.62% 33.061us 5.510us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.62% 196.364us 9.62% 196.364us 32.727us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.58% 32.223us 2.02% 41.242us 3.437us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.44% 9.019us 0.44% 9.019us 0.752us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.10% 42.790us 2.10% 42.790us 7.132us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.250us 0.26% 5.250us 5.250us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 860.101us -Self CUDA time total: 119.292us +Self CPU time total: 2.042ms +Self CUDA time total: 145.920us @@ -4561,23 +4570,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 357.115us 181.96% 357.115us 357.115us 1 - hf_kernels_rotary 18.86% 155.885us 99.43% 821.750us 821.750us 0.000us 0.00% 219.904us 219.904us 1 - _rotary_dba7d1e::apply_rotary 5.36% 44.321us 10.59% 87.561us 14.594us 115.808us 59.01% 115.808us 19.301us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 115.808us 59.01% 115.808us 19.301us 6 - aten::clone 2.51% 20.740us 64.81% 535.643us 89.274us 0.000us 0.00% 104.096us 17.349us 6 - aten::copy_ 4.34% 35.891us 58.73% 485.402us 80.900us 80.448us 40.99% 104.096us 17.349us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 80.448us 40.99% 80.448us 13.408us 6 - Activity Buffer Request 32.66% 269.957us 32.66% 269.957us 269.957us 23.648us 12.05% 23.648us 23.648us 1 - aten::empty_strided 3.57% 29.501us 3.57% 29.501us 4.917us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.72% 179.554us 21.72% 179.554us 29.926us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.97% 32.801us 5.16% 42.661us 3.555us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.19% 9.860us 1.19% 9.860us 0.822us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.23% 43.240us 5.23% 43.240us 7.207us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.750us 0.57% 4.750us 4.750us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 19.65% 213.486us 76.93% 835.739us 835.739us 0.000us 0.00% 746.680us 746.680us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 687.960us 101.16% 687.960us 687.960us 1 + aten::clone 1.98% 21.549us 45.41% 493.341us 82.224us 0.000us 0.00% 557.242us 92.874us 6 + aten::copy_ 3.46% 37.632us 40.54% 440.431us 73.405us 490.619us 72.14% 557.242us 92.874us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 490.619us 72.14% 490.619us 81.770us 6 + _rotary_dba7d1e::apply_rotary 4.15% 45.032us 8.07% 87.672us 14.612us 189.438us 27.86% 189.438us 31.573us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 189.438us 27.86% 189.438us 31.573us 6 + Activity Buffer Request 19.06% 207.035us 19.06% 207.035us 207.035us 66.623us 9.80% 66.623us 66.623us 1 + aten::empty_strided 2.89% 31.361us 2.89% 31.361us 5.227us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.02% 195.764us 18.02% 195.764us 32.627us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.96% 32.192us 3.80% 41.240us 3.437us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.83% 9.048us 0.83% 9.048us 0.754us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.93% 42.640us 3.93% 42.640us 7.107us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 23.07% 250.625us 23.07% 250.625us 250.625us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 826.500us -Self CUDA time total: 196.256us +Self CPU time total: 1.086ms +Self CUDA time total: 680.057us @@ -4587,60 +4596,60 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 13.04% 159.984us 66.42% 814.800us 814.800us 0.000us 0.00% 847.705us 847.705us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 789.466us 101.01% 789.466us 789.466us 1 - aten::clone 1.84% 22.521us 42.98% 527.184us 87.864us 0.000us 0.00% 577.883us 96.314us 6 - aten::copy_ 2.96% 36.311us 38.61% 473.681us 78.947us 511.772us 65.48% 577.883us 96.314us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 511.772us 65.48% 511.772us 85.295us 6 - _rotary_dba7d1e::apply_rotary 3.59% 44.023us 6.92% 84.943us 14.157us 269.822us 34.52% 269.822us 44.970us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 269.822us 34.52% 269.822us 44.970us 6 - Activity Buffer Request 21.07% 258.456us 21.07% 258.456us 258.456us 66.111us 8.46% 66.111us 66.111us 1 - aten::empty_strided 2.53% 30.982us 2.53% 30.982us 5.164us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.58% 178.914us 14.58% 178.914us 29.819us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.74% 33.620us 3.48% 42.689us 3.557us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.74% 9.069us 0.74% 9.069us 0.756us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.34% 40.920us 3.34% 40.920us 6.820us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 33.58% 411.910us 33.58% 411.910us 411.910us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 5.18% 149.334us 27.72% 799.139us 799.139us 0.000us 0.00% 2.625ms 2.625ms 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.454ms 100.33% 2.454ms 2.454ms 1 + aten::clone 0.74% 21.311us 18.12% 522.352us 87.059us 0.000us 0.00% 1.393ms 232.143us 6 + aten::copy_ 1.34% 38.611us 16.30% 469.821us 78.303us 1.214ms 49.62% 1.393ms 232.143us 6 + _rotary_dba7d1e::apply_rotary 1.48% 42.661us 3.03% 87.271us 14.545us 1.232ms 50.38% 1.232ms 205.327us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.232ms 50.38% 1.232ms 205.327us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.214ms 49.62% 1.214ms 202.255us 6 + Activity Buffer Request 8.25% 237.786us 8.25% 237.786us 237.786us 179.327us 7.33% 179.327us 179.327us 1 + aten::empty_strided 1.08% 31.220us 1.08% 31.220us 5.203us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.71% 193.424us 6.71% 193.424us 32.237us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.11% 31.861us 1.39% 40.182us 3.349us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.29% 8.321us 0.29% 8.321us 0.693us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.55% 44.610us 1.55% 44.610us 7.435us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 72.28% 2.083ms 72.28% 2.083ms 2.083ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.227ms -Self CUDA time total: 781.594us +Self CPU time total: 2.882ms +Self CUDA time total: 2.445ms impl wl p50(ms) ok -hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False -hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False -hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False -hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False -hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False +hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.07 True +hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True +hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.84 True +hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.26 True +hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True +hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22.14it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22.12it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.01it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.00it/s]

Artifacts:

rotary.jsonl diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html index 7606a093a65d04c40d580abf67d210368fd50dcd..530497fb2e3de16d32a4b610b8643a5793f056bb 100644 --- a/rotary/impls/torch_rotary.html +++ b/rotary/impls/torch_rotary.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3871,7 +3879,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.20s +Cell: nv | 0.23s | Raw @@ -3887,7 +3895,7 @@ Cell: nv | 0.20s
-
Wed Oct 29 14:26:51 2025       
+
Wed Oct 29 15:50:24 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3904,7 @@ Cell: nv | 0.20s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   29C    P0             88W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3918,9 +3926,9 @@ Cell: nv | 0.20s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 3.84s
+Cell: benchmark | 7.50s
  | 
 
 Raw
@@ -3999,27 +4007,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.124ms      1261.56%       1.124ms       1.124ms             1  
-                                            torch_eager        14.73%     412.767us        99.72%       2.794ms       2.794ms       0.000us         0.00%      90.337us      90.337us             1  
-                                              aten::mul         6.25%     175.043us        11.07%     310.105us      12.921us      46.912us        52.64%      46.912us       1.955us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.912us        52.64%      46.912us       1.955us            24  
-                                            aten::copy_         4.12%     115.463us        61.76%       1.730ms      96.132us      28.993us        32.53%      30.210us       1.678us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.368us        25.10%      22.368us       1.864us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.215us        14.83%      13.215us       1.101us            12  
-                                            aten::clone         1.31%      36.692us        59.66%       1.671ms     278.565us       0.000us         0.00%       7.842us       1.307us             6  
-                                              aten::sub         1.68%      47.063us         2.72%      76.213us      12.702us       6.655us         7.47%       6.655us       1.109us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.625us         7.43%       6.625us       1.104us             6  
-                                              aten::add         1.39%      39.044us         2.34%      65.583us      10.930us       6.560us         7.36%       6.560us       1.093us             6  
-                                Activity Buffer Request        52.45%       1.470ms        52.45%       1.470ms       1.470ms       1.217us         1.37%       1.217us       1.217us             1  
-                                    aten::empty_strided         1.99%      55.621us         1.99%      55.621us       9.270us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.66%      74.431us         2.66%      74.431us      12.405us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.98%      83.492us         3.80%     106.494us       4.437us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.82%      23.002us         0.82%      23.002us       0.958us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.34%     261.675us         9.34%     261.675us       5.452us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.28%       7.890us         0.28%       7.890us       7.890us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.033ms      1157.58%       1.033ms       1.033ms             1  
+                                            torch_eager        14.26%     386.998us        99.70%       2.705ms       2.705ms       0.000us         0.00%      90.431us      90.431us             1  
+                                              aten::mul         6.08%     164.867us        10.45%     283.577us      11.816us      46.976us        52.65%      46.976us       1.957us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.976us        52.65%      46.976us       1.957us            24  
+                                            aten::copy_         3.96%     107.533us        62.14%       1.686ms      93.665us      28.959us        32.46%      30.175us       1.676us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.303us        25.00%      22.303us       1.859us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.89%      13.280us       1.107us            12  
+                                            aten::clone         1.58%      42.971us        61.19%       1.660ms     276.703us       0.000us         0.00%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us         7.46%       6.656us       1.109us             6  
+                                              aten::sub         1.73%      46.871us         2.69%      72.911us      12.152us       6.656us         7.46%       6.656us       1.109us             6  
+                                              aten::add         1.35%      36.531us         2.16%      58.672us       9.779us       6.624us         7.42%       6.624us       1.104us             6  
+                                Activity Buffer Request        53.14%       1.442ms        53.14%       1.442ms       1.442ms       1.216us         1.36%       1.216us       1.216us             1  
+                                    aten::empty_strided         2.28%      61.772us         2.28%      61.772us      10.295us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.73%      74.144us         2.73%      74.144us      12.357us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.20%      86.920us         4.13%     112.081us       4.670us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.93%      25.161us         0.93%      25.161us       1.048us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.45%     229.371us         8.45%     229.371us       4.779us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.30%       8.270us         0.30%       8.270us       8.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.802ms
-Self CUDA time total: 89.120us
+Self CPU time total: 2.713ms
+Self CUDA time total: 89.215us
 
 
 
@@ -4029,27 +4037,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     968.092us      1071.28%     968.092us     968.092us             1  
-                                            torch_eager        12.50%     317.076us        99.79%       2.532ms       2.532ms       0.000us         0.00%      91.488us      91.488us             1  
-                                              aten::mul         6.07%     153.959us        10.35%     262.528us      10.939us      47.648us        52.73%      47.648us       1.985us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.648us        52.73%      47.648us       1.985us            24  
-                                            aten::copy_         4.16%     105.603us        65.14%       1.653ms      91.828us      29.344us        32.47%      30.464us       1.692us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        25.00%      22.592us       1.883us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.80%      13.376us       1.115us            12  
-                                            aten::clone         1.12%      28.391us        62.74%       1.592ms     265.351us       0.000us         0.00%       7.872us       1.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us         7.47%       6.752us       1.125us             6  
-                                              aten::sub         1.55%      39.261us         2.49%      63.132us      10.522us       6.688us         7.40%       6.688us       1.115us             6  
-                                              aten::add         1.47%      37.180us         2.35%      59.741us       9.957us       6.688us         7.40%       6.688us       1.115us             6  
-                                Activity Buffer Request        56.17%       1.425ms        56.17%       1.425ms       1.425ms       1.120us         1.24%       1.120us       1.120us             1  
-                                    aten::empty_strided         2.04%      51.662us         2.04%      51.662us       8.610us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.12%      53.792us         2.12%      53.792us       8.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.04%      77.153us         3.82%      96.932us       4.039us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      19.779us         0.78%      19.779us       0.824us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.79%     223.101us         8.79%     223.101us       4.648us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.210us         0.21%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     920.346us      1019.88%     920.346us     920.346us             1  
+                                            torch_eager        11.67%     287.669us        99.75%       2.459ms       2.459ms       0.000us         0.00%      91.392us      91.392us             1  
+                                              aten::mul         5.97%     147.150us        10.47%     258.131us      10.755us      47.681us        52.84%      47.681us       1.987us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.681us        52.84%      47.681us       1.987us            24  
+                                            aten::copy_         4.01%      98.743us        66.94%       1.650ms      91.665us      29.184us        32.34%      30.335us       1.685us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.433us        24.86%      22.433us       1.869us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.82%      13.376us       1.115us            12  
+                                            aten::clone         0.96%      23.772us        64.13%       1.581ms     263.446us       0.000us         0.00%       7.902us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us         7.48%       6.751us       1.125us             6  
+                                              aten::sub         1.51%      37.314us         2.51%      61.954us      10.326us       6.720us         7.45%       6.720us       1.120us             6  
+                                              aten::add         1.33%      32.821us         2.21%      54.451us       9.075us       6.656us         7.38%       6.656us       1.109us             6  
+                                Activity Buffer Request        58.20%       1.434ms        58.20%       1.434ms       1.434ms       1.151us         1.28%       1.151us       1.151us             1  
+                                    aten::empty_strided         1.33%      32.830us         1.33%      32.830us       5.472us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.21%      54.420us         2.21%      54.420us       9.070us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.84%      69.900us         3.65%      89.853us       3.744us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.81%      19.953us         0.81%      19.953us       0.831us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.92%     219.731us         8.92%     219.731us       4.578us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.25%       6.050us         0.25%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.538ms
-Self CUDA time total: 90.368us
+Self CPU time total: 2.465ms
+Self CUDA time total: 90.241us
 
 
 
@@ -4059,27 +4067,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.007ms      1071.77%       1.007ms       1.007ms             1  
-                                            torch_eager        12.81%     333.813us        99.77%       2.600ms       2.600ms       0.000us         0.00%      95.234us      95.234us             1  
-                                              aten::mul         6.17%     160.752us        10.75%     280.063us      11.669us      48.706us        51.86%      48.706us       2.029us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.706us        51.86%      48.706us       2.029us            24  
-                                            aten::copy_         4.30%     112.081us        64.85%       1.690ms      93.891us      30.753us        32.74%      32.065us       1.781us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.009us        24.50%      23.009us       1.917us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.463us        15.40%      14.463us       1.205us            12  
-                                            aten::clone         1.08%      28.070us        62.18%       1.621ms     270.093us       0.000us         0.00%       9.056us       1.509us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         8.25%       7.744us       1.291us             6  
-                                              aten::sub         1.50%      39.201us         2.50%      65.063us      10.844us       7.263us         7.73%       7.263us       1.211us             6  
-                                              aten::add         1.40%      36.592us         2.30%      59.882us       9.980us       7.200us         7.67%       7.200us       1.200us             6  
-                                Activity Buffer Request        55.61%       1.449ms        55.61%       1.449ms       1.449ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.87%      48.773us         1.87%      48.773us       8.129us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.21%      57.593us         2.21%      57.593us       9.599us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.85%      74.230us         3.62%      94.450us       3.935us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      20.220us         0.78%      20.220us       0.842us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.19%     239.464us         9.19%     239.464us       4.989us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.23%       5.970us         0.23%       5.970us       5.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.568us       966.47%     909.568us     909.568us             1  
+                                            torch_eager        11.23%     276.876us        99.79%       2.460ms       2.460ms       0.000us         0.00%      95.424us      95.424us             1  
+                                              aten::mul         6.27%     154.461us        10.66%     262.794us      10.950us      48.800us        51.85%      48.800us       2.033us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.800us        51.85%      48.800us       2.033us            24  
+                                            aten::copy_         4.02%      99.094us        67.67%       1.668ms      92.677us      30.912us        32.85%      32.224us       1.790us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.008us        24.45%      23.008us       1.917us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.400us        15.30%      14.400us       1.200us            12  
+                                            aten::clone         0.93%      22.950us        64.64%       1.593ms     265.583us       0.000us         0.00%       9.216us       1.536us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.40%       7.904us       1.317us             6  
+                                              aten::sub         1.56%      38.564us         2.52%      62.034us      10.339us       7.200us         7.65%       7.200us       1.200us             6  
+                                              aten::add         1.24%      30.660us         2.12%      52.250us       8.708us       7.200us         7.65%       7.200us       1.200us             6  
+                                Activity Buffer Request        58.87%       1.451ms        58.87%       1.451ms       1.451ms       1.312us         1.39%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.24%      30.531us         1.24%      30.531us       5.089us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.20%      54.240us         2.20%      54.240us       9.040us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.65%      65.401us         3.42%      84.323us       3.513us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.77%      18.922us         0.77%      18.922us       0.788us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.80%     216.993us         8.80%     216.993us       4.521us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.190us         0.21%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.606ms
-Self CUDA time total: 93.922us
+Self CPU time total: 2.465ms
+Self CUDA time total: 94.112us
 
 
 
@@ -4089,27 +4097,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     976.889us       967.02%     976.889us     976.889us             1  
-                                            torch_eager        12.01%     329.416us        99.82%       2.739ms       2.739ms       0.000us         0.00%     102.333us     102.333us             1  
-                                              aten::mul         5.67%     155.545us         9.73%     266.927us      11.122us      52.800us        52.27%      52.800us       2.200us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.800us        52.27%      52.800us       2.200us            24  
-                                            aten::copy_         3.82%     104.765us        68.18%       1.871ms     103.922us      32.349us        32.02%      33.661us       1.870us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.574us        24.33%      24.574us       2.048us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.872us        15.71%      15.872us       1.323us            12  
-                                            aten::clone         1.07%      29.290us        65.23%       1.790ms     298.277us       0.000us         0.00%       9.087us       1.515us             6  
-                                              aten::sub         1.39%      38.150us         2.28%      62.431us      10.405us       7.936us         7.86%       7.936us       1.323us             6  
-                                              aten::add         1.24%      34.113us         2.07%      56.743us       9.457us       7.936us         7.86%       7.936us       1.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.70%       7.775us       1.296us             6  
-                                Activity Buffer Request        52.33%       1.436ms        52.33%       1.436ms       1.436ms       1.312us         1.30%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.16%      31.821us         1.16%      31.821us       5.304us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.42%     258.335us         9.42%     258.335us      43.056us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      72.071us         3.33%      91.411us       3.809us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      19.340us         0.70%      19.340us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.39%     230.176us         8.39%     230.176us       4.795us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.010us         0.18%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     892.572us       880.74%     892.572us     892.572us             1  
+                                            torch_eager        11.35%     283.366us        99.78%       2.492ms       2.492ms       0.000us         0.00%     102.687us     102.687us             1  
+                                              aten::mul         5.93%     148.202us        10.19%     254.513us      10.605us      52.956us        52.25%      52.956us       2.207us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.956us        52.25%      52.956us       2.207us            24  
+                                            aten::copy_         3.94%      98.395us        68.27%       1.705ms      94.725us      32.482us        32.05%      33.826us       1.879us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        24.31%      24.641us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.905us        15.69%      15.905us       1.325us            12  
+                                            aten::clone         0.86%      21.380us        65.50%       1.636ms     272.651us       0.000us         0.00%       9.185us       1.531us             6  
+                                              aten::add         1.24%      31.000us         2.12%      53.041us       8.840us       8.032us         7.93%       8.032us       1.339us             6  
+                                              aten::sub         1.40%      35.052us         2.32%      58.022us       9.670us       7.873us         7.77%       7.873us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us         7.74%       7.841us       1.307us             6  
+                                Activity Buffer Request        52.43%       1.309ms        52.43%       1.309ms       1.309ms       1.344us         1.33%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.32%      33.071us         1.32%      33.071us       5.512us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.52%     237.764us         9.52%     237.764us      39.627us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.60%      64.825us         3.35%      83.624us       3.484us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.75%      18.799us         0.75%      18.799us       0.783us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.44%     210.793us         8.44%     210.793us       4.392us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       5.611us         0.22%       5.611us       5.611us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.744ms
-Self CUDA time total: 101.021us
+Self CPU time total: 2.498ms
+Self CUDA time total: 101.343us
 
 
 
@@ -4119,27 +4127,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     972.954us      1035.95%     972.954us     972.954us             1  
-                                            torch_eager        11.82%     323.628us        99.83%       2.734ms       2.734ms       0.000us         0.00%      95.231us      95.231us             1  
-                                              aten::mul         5.48%     150.092us         9.71%     265.906us      11.079us      48.958us        52.13%      48.958us       2.040us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.958us        52.13%      48.958us       2.040us            24  
-                                            aten::copy_         4.01%     109.805us        68.55%       1.878ms     104.307us      30.784us        32.78%      32.096us       1.783us            18  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     907.478us       966.25%     907.478us     907.478us             1  
+                                            torch_eager        11.02%     305.318us        99.81%       2.765ms       2.765ms       0.000us         0.00%      95.230us      95.230us             1  
+                                              aten::mul         5.24%     145.172us         9.20%     254.787us      10.616us      49.023us        52.20%      49.023us       2.043us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.023us        52.20%      49.023us       2.043us            24  
+                                            aten::copy_         3.74%     103.536us        70.23%       1.945ms     108.067us      30.719us        32.71%      32.031us       1.779us            18  
 void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.40%      22.912us       1.909us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.177us        15.09%      14.177us       1.181us            12  
-                                            aten::clone         0.98%      26.740us        65.50%       1.794ms     299.012us       0.000us         0.00%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.38%       7.872us       1.312us             6  
-                                              aten::sub         1.35%      37.100us         2.22%      60.781us      10.130us       7.106us         7.57%       7.106us       1.184us             6  
-                                              aten::add         1.26%      34.471us         2.07%      56.641us       9.440us       7.071us         7.53%       7.071us       1.178us             6  
-                                Activity Buffer Request        53.28%       1.459ms        53.28%       1.459ms       1.459ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.12%      30.591us         1.12%      30.591us       5.098us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.84%     242.034us         8.84%     242.034us      40.339us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.64%      72.284us         3.37%      92.363us       3.848us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.73%      20.079us         0.73%      20.079us       0.837us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.33%     228.067us         8.33%     228.067us       4.751us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.17%       4.701us         0.17%       4.701us       4.701us       0.000us         0.00%       0.000us       0.000us             1  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.09%      14.176us       1.181us            12  
+                                            aten::clone         1.09%      30.110us        67.87%       1.880ms     313.329us       0.000us         0.00%       9.119us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         8.31%       7.807us       1.301us             6  
+                                              aten::sub         1.24%      34.480us         2.10%      58.270us       9.712us       7.104us         7.56%       7.104us       1.184us             6  
+                                              aten::add         1.09%      30.091us         1.87%      51.880us       8.647us       7.072us         7.53%       7.072us       1.179us             6  
+                                Activity Buffer Request        52.12%       1.444ms        52.12%       1.444ms       1.444ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.13%      31.430us         1.13%      31.430us       5.238us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        12.15%     336.439us        12.15%     336.439us      56.073us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.48%      68.768us         3.17%      87.719us       3.655us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.68%      18.951us         0.68%      18.951us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.82%     216.674us         7.82%     216.674us       4.514us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.210us         0.19%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.739ms
-Self CUDA time total: 93.919us
+Self CPU time total: 2.770ms
+Self CUDA time total: 93.918us
 
 
 
@@ -4149,27 +4157,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     940.506us       929.78%     940.506us     940.506us             1  
-                                            torch_eager        10.47%     280.203us        99.80%       2.672ms       2.672ms       0.000us         0.00%     102.466us     102.466us             1  
-                                              aten::mul         5.68%     151.942us         9.93%     265.874us      11.078us      52.767us        52.17%      52.767us       2.199us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.767us        52.17%      52.767us       2.199us            24  
-                                            aten::copy_         3.99%     106.699us        69.68%       1.866ms     103.641us      32.384us        32.01%      33.696us       1.872us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.672us        24.39%      24.672us       2.056us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.003us        15.82%      16.003us       1.334us            12  
-                                            aten::clone         0.80%      21.540us        66.42%       1.778ms     296.379us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::sub         1.42%      38.052us         2.40%      64.133us      10.689us       8.002us         7.91%       8.002us       1.334us             6  
-                                              aten::add         1.23%      32.860us         2.10%      56.182us       9.364us       8.001us         7.91%       8.001us       1.333us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.62%       7.712us       1.285us             6  
-                                Activity Buffer Request        54.45%       1.458ms        54.45%       1.458ms       1.458ms       1.312us         1.30%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.14%      30.450us         1.14%      30.450us       5.075us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.74%     234.006us         8.74%     234.006us      39.001us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.58%      69.109us         3.28%      87.850us       3.660us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      18.741us         0.70%      18.741us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.61%     230.527us         8.61%     230.527us       4.803us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.400us         0.20%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     917.786us       906.49%     917.786us     917.786us             1  
+                                            torch_eager        10.59%     290.695us        99.81%       2.741ms       2.741ms       0.000us         0.00%     102.558us     102.558us             1  
+                                              aten::mul         5.39%     148.136us         9.30%     255.477us      10.645us      52.735us        52.09%      52.735us       2.197us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.735us        52.09%      52.735us       2.197us            24  
+                                            aten::copy_         4.15%     114.085us        70.69%       1.941ms     107.839us      32.512us        32.11%      33.824us       1.879us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.704us        24.40%      24.704us       2.059us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.999us        15.80%      15.999us       1.333us            12  
+                                            aten::clone         0.78%      21.500us        67.65%       1.858ms     309.627us       0.000us         0.00%       9.120us       1.520us             6  
+                                              aten::sub         1.39%      38.270us         2.26%      62.070us      10.345us       8.063us         7.96%       8.063us       1.344us             6  
+                                              aten::add         1.13%      31.111us         1.93%      52.881us       8.813us       7.936us         7.84%       7.936us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.71%       7.808us       1.301us             6  
+                                Activity Buffer Request        52.71%       1.447ms        52.71%       1.447ms       1.447ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.19%      32.762us         1.19%      32.762us       5.460us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.56%     317.516us        11.56%     317.516us      52.919us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.38%      65.270us         3.07%      84.260us       3.511us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.69%      18.990us         0.69%      18.990us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.83%     214.935us         7.83%     214.935us       4.478us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.200us         0.19%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.677ms
-Self CUDA time total: 101.154us
+Self CPU time total: 2.746ms
+Self CUDA time total: 101.246us
 
 
 
@@ -4179,27 +4187,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.015ms       844.44%       1.015ms       1.015ms             1  
-                                            torch_eager        10.99%     299.529us        99.80%       2.720ms       2.720ms       0.000us         0.00%     122.045us     122.045us             1  
-                                              aten::mul         5.97%     162.734us        10.28%     280.227us      11.676us      61.856us        51.45%      61.856us       2.577us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.856us        51.45%      61.856us       2.577us            24  
-                                            aten::copy_         4.97%     135.364us        68.63%       1.870ms     103.912us      39.199us        32.61%      41.023us       2.279us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.704us        23.88%      28.704us       2.392us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.166us        15.94%      19.166us       1.597us            12  
-                                            aten::clone         0.84%      22.992us        64.39%       1.755ms     292.512us       0.000us         0.00%      12.319us       2.053us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.73%      10.495us       1.749us             6  
-                                              aten::add         1.19%      32.530us         2.08%      56.691us       9.448us       9.598us         7.98%       9.598us       1.600us             6  
-                                              aten::sub         1.40%      38.111us         2.30%      62.811us      10.468us       9.568us         7.96%       9.568us       1.595us             6  
-                                Activity Buffer Request        52.53%       1.432ms        52.53%       1.432ms       1.432ms       1.824us         1.52%       1.824us       1.824us             1  
-                                    aten::empty_strided         1.18%      32.290us         1.18%      32.290us       5.382us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.53%     232.585us         8.53%     232.585us      38.764us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.71%      73.938us         3.49%      95.000us       3.958us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.77%      21.062us         0.77%      21.062us       0.878us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.70%     237.086us         8.70%     237.086us       4.939us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.570us         0.20%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     896.601us       744.17%     896.601us     896.601us             1  
+                                            torch_eager        10.66%     286.835us        99.81%       2.687ms       2.687ms       0.000us         0.00%     122.275us     122.275us             1  
+                                              aten::mul         5.47%     147.118us         9.41%     253.291us      10.554us      61.985us        51.45%      61.985us       2.583us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.985us        51.45%      61.985us       2.583us            24  
+                                            aten::copy_         3.72%     100.260us        70.38%       1.894ms     105.246us      39.265us        32.59%      41.057us       2.281us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.834us        23.93%      28.834us       2.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.233us        15.96%      19.233us       1.603us            12  
+                                            aten::clone         0.83%      22.211us        67.89%       1.827ms     304.542us       0.000us         0.00%      12.223us       2.037us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us         8.66%      10.431us       1.738us             6  
+                                              aten::add         1.14%      30.799us         1.94%      52.140us       8.690us       9.632us         7.99%       9.632us       1.605us             6  
+                                              aten::sub         1.37%      36.770us         2.23%      59.970us       9.995us       9.601us         7.97%       9.601us       1.600us             6  
+                                Activity Buffer Request        53.18%       1.431ms        53.18%       1.431ms       1.431ms       1.792us         1.49%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.21%      32.491us         1.21%      32.491us       5.415us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.26%     303.147us        11.26%     303.147us      50.525us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.49%      66.932us         3.17%      85.280us       3.553us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.68%      18.348us         0.68%      18.348us       0.765us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.81%     210.347us         7.81%     210.347us       4.382us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.020us         0.19%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.726ms
-Self CUDA time total: 120.221us
+Self CPU time total: 2.692ms
+Self CUDA time total: 120.483us
 
 
 
@@ -4209,27 +4217,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     951.101us       552.87%     951.101us     951.101us             1  
-                                            torch_eager        11.67%     313.772us        99.81%       2.683ms       2.683ms       0.000us         0.00%     174.878us     174.878us             1  
-                                              aten::mul         5.73%     154.081us         9.89%     265.836us      11.076us      89.599us        52.08%      89.599us       3.733us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.599us        52.08%      89.599us       3.733us            24  
-                                            aten::copy_         3.89%     104.453us        68.40%       1.838ms     102.128us      57.664us        33.52%      60.512us       3.362us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.832us        23.74%      40.832us       3.403us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.767us        14.40%      24.767us       2.064us            12  
-                                            aten::clone         1.01%      27.120us        65.39%       1.758ms     292.937us       0.000us         0.00%      19.680us       3.280us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us         9.78%      16.832us       2.805us             6  
-                                              aten::add         1.27%      34.231us         2.14%      57.531us       9.588us      12.416us         7.22%      12.416us       2.069us             6  
-                                              aten::sub         1.34%      36.001us         2.22%      59.581us       9.930us      12.351us         7.18%      12.351us       2.059us             6  
-                                Activity Buffer Request        53.45%       1.437ms        53.45%       1.437ms       1.437ms       2.848us         1.66%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.13%      30.290us         1.13%      30.290us       5.048us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.55%     229.865us         8.55%     229.865us      38.311us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      70.721us         3.36%      90.322us       3.763us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.73%      19.601us         0.73%      19.601us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.41%     225.976us         8.41%     225.976us       4.708us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.001us         0.19%       5.001us       5.001us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     885.202us       514.56%     885.202us     885.202us             1  
+                                            torch_eager        18.81%     279.303us        99.64%       1.480ms       1.480ms       0.000us         0.00%     174.944us     174.944us             1  
+                                              aten::mul         9.70%     144.115us        16.98%     252.116us      10.505us      89.439us        51.99%      89.439us       3.727us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.439us        51.99%      89.439us       3.727us            24  
+                                            aten::copy_         6.85%     101.723us        47.28%     702.206us      39.011us      57.632us        33.50%      60.544us       3.364us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.608us        23.60%      40.608us       3.384us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.961us        14.51%      24.961us       2.080us            12  
+                                            aten::clone         1.41%      20.892us        42.46%     630.635us     105.106us       0.000us         0.00%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.90%      17.024us       2.837us             6  
+                                              aten::add         2.07%      30.702us         3.51%      52.142us       8.690us      12.545us         7.29%      12.545us       2.091us             6  
+                                              aten::sub         2.41%      35.732us         4.00%      59.442us       9.907us      12.416us         7.22%      12.416us       2.069us             6  
+                                Activity Buffer Request        17.15%     254.675us        17.15%     254.675us     254.675us       2.912us         1.69%       2.912us       2.912us             1  
+                                    aten::empty_strided         2.07%      30.780us         2.07%      30.780us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        19.36%     287.456us        19.36%     287.456us      47.909us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.32%      64.164us         5.58%      82.803us       3.450us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      18.639us         1.26%      18.639us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.24%     211.503us        14.24%     211.503us       4.406us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.410us         0.36%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.688ms
-Self CUDA time total: 172.030us
+Self CPU time total: 1.485ms
+Self CUDA time total: 172.032us
 
 
 
@@ -4239,27 +4247,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     927.996us       768.63%     927.996us     927.996us             1  
-                                            torch_eager        20.13%     284.369us        99.65%       1.408ms       1.408ms       0.000us         0.00%     122.557us     122.557us             1  
-                                              aten::mul        10.77%     152.163us        18.72%     264.405us      11.017us      62.048us        51.39%      62.048us       2.585us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.048us        51.39%      62.048us       2.585us            24  
-                                            aten::copy_         7.56%     106.823us        43.43%     613.475us      34.082us      39.390us        32.63%      41.213us       2.290us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.91%      28.864us       2.405us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.296us        15.98%      19.296us       1.608us            12  
-                                            aten::clone         1.39%      19.620us        37.04%     523.281us      87.213us       0.000us         0.00%      12.349us       2.058us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.526us         8.72%      10.526us       1.754us             6  
-                                              aten::add         2.28%      32.232us         3.86%      54.523us       9.087us       9.696us         8.03%       9.696us       1.616us             6  
-                                              aten::sub         2.48%      35.082us         4.10%      57.982us       9.664us       9.600us         7.95%       9.600us       1.600us             6  
-                                Activity Buffer Request        14.96%     211.375us        14.96%     211.375us     211.375us       1.823us         1.51%       1.823us       1.823us             1  
-                                    aten::empty_strided         2.07%      29.290us         2.07%      29.290us       4.882us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.27%     229.815us        16.27%     229.815us      38.302us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.68%      66.168us         5.95%      84.051us       3.502us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.27%      17.883us         1.27%      17.883us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.78%     222.895us        15.78%     222.895us       4.644us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       4.970us         0.35%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     907.735us       751.64%     907.735us     907.735us             1  
+                                            torch_eager        18.35%     272.536us        99.65%       1.480ms       1.480ms       0.000us         0.00%     122.527us     122.527us             1  
+                                              aten::mul         9.89%     146.883us        17.48%     259.553us      10.815us      62.078us        51.40%      62.078us       2.587us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.078us        51.40%      62.078us       2.587us            24  
+                                            aten::copy_         6.65%      98.730us        45.99%     682.885us      37.938us      39.328us        32.57%      41.088us       2.283us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.832us        23.87%      28.832us       2.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.361us        16.03%      19.361us       1.613us            12  
+                                            aten::clone         2.58%      38.249us        42.54%     631.763us     105.294us       0.000us         0.00%      12.256us       2.043us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us         8.69%      10.496us       1.749us             6  
+                                              aten::add         2.13%      31.663us         3.60%      53.483us       8.914us       9.728us         8.06%       9.728us       1.621us             6  
+                                              aten::sub         2.35%      34.954us         3.91%      58.043us       9.674us       9.633us         7.98%       9.633us       1.605us             6  
+                                Activity Buffer Request        16.88%     250.706us        16.88%     250.706us     250.706us       1.760us         1.46%       1.760us       1.760us             1  
+                                    aten::empty_strided         2.15%      31.912us         2.15%      31.912us       5.319us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.48%     274.437us        18.48%     274.437us      45.739us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.31%      63.964us         5.59%      83.053us       3.461us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      19.089us         1.29%      19.089us       0.795us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.59%     216.591us        14.59%     216.591us       4.512us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.220us         0.35%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.413ms
-Self CUDA time total: 120.734us
+Self CPU time total: 1.485ms
+Self CUDA time total: 120.767us
 
 
 
@@ -4269,27 +4277,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     941.367us       547.21%     941.367us     941.367us             1  
-                                            torch_eager        19.36%     280.543us        99.66%       1.444ms       1.444ms       0.000us         0.00%     174.877us     174.877us             1  
-                                              aten::mul        10.67%     154.592us        18.48%     267.677us      11.153us      89.535us        52.05%      89.535us       3.731us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.535us        52.05%      89.535us       3.731us            24  
-                                            aten::copy_         7.38%     106.934us        44.27%     641.329us      35.629us      57.694us        33.54%      60.542us       3.363us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.701us        23.66%      40.701us       3.392us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.800us        14.42%      24.800us       2.067us            12  
-                                            aten::clone         1.44%      20.830us        37.97%     550.103us      91.684us       0.000us         0.00%      19.841us       3.307us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.993us         9.88%      16.993us       2.832us             6  
-                                              aten::add         2.36%      34.121us         3.90%      56.522us       9.420us      12.448us         7.24%      12.448us       2.075us             6  
-                                              aten::sub         2.56%      37.161us         4.27%      61.881us      10.313us      12.352us         7.18%      12.352us       2.059us             6  
-                                Activity Buffer Request        16.20%     234.686us        16.20%     234.686us     234.686us       2.848us         1.66%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.02%      29.270us         2.02%      29.270us       4.878us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.95%     231.027us        15.95%     231.027us      38.505us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.63%      67.091us         5.92%      85.764us       3.573us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.29%      18.673us         1.29%      18.673us       0.778us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.80%     228.888us        15.80%     228.888us       4.768us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.980us         0.34%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     894.399us       519.81%     894.399us     894.399us             1  
+                                            torch_eager        10.51%     278.801us        99.79%       2.648ms       2.648ms       0.000us         0.00%     174.911us     174.911us             1  
+                                              aten::mul         5.47%     145.104us         9.49%     251.734us      10.489us      89.535us        52.04%      89.535us       3.731us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.535us        52.04%      89.535us       3.731us            24  
+                                            aten::copy_         3.73%      98.901us        70.34%       1.866ms     103.682us      57.696us        33.53%      60.544us       3.364us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.704us        23.66%      40.704us       3.392us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.832us        14.43%      24.832us       2.069us            12  
+                                            aten::clone         0.84%      22.190us        67.69%       1.796ms     299.337us       0.000us         0.00%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.88%      16.992us       2.832us             6  
+                                              aten::sub         1.44%      38.162us         2.33%      61.942us      10.324us      12.448us         7.23%      12.448us       2.075us             6  
+                                              aten::add         1.15%      30.549us         1.97%      52.171us       8.695us      12.384us         7.20%      12.384us       2.064us             6  
+                                Activity Buffer Request        54.02%       1.433ms        54.02%       1.433ms       1.433ms       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.13%      30.052us         1.13%      30.052us       5.009us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.37%     275.065us        10.37%     275.065us      45.844us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.49%      65.991us         3.19%      84.601us       3.525us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      18.610us         0.70%      18.610us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.95%     211.023us         7.95%     211.023us       4.396us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.640us         0.21%       5.640us       5.640us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.449ms
-Self CUDA time total: 172.029us
+Self CPU time total: 2.653ms
+Self CUDA time total: 172.063us
 
 
 
@@ -4299,27 +4307,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     950.141us       334.64%     950.141us     950.141us             1  
-                                            torch_eager        11.47%     310.562us        99.82%       2.702ms       2.702ms       0.000us         0.00%     302.012us     302.012us             1  
-                                              aten::mul         5.57%     150.802us         9.64%     260.955us      10.873us     133.822us        47.13%     133.822us       5.576us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.822us        47.13%     133.822us       5.576us            24  
-                                            aten::copy_         3.88%     105.155us        69.00%       1.868ms     103.782us     109.151us        38.44%     127.231us       7.068us            18  
-                                            aten::clone         0.99%      26.749us        66.03%       1.788ms     297.926us       0.000us         0.00%      69.886us      11.648us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.345us        20.20%      57.345us       4.779us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.806us        18.25%      51.806us       8.634us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.959us        14.43%      40.959us       3.413us            12  
-                                              aten::sub         1.29%      34.831us         2.15%      58.172us       9.695us      20.607us         7.26%      20.607us       3.435us             6  
-                                              aten::add         1.26%      34.242us         2.11%      57.104us       9.517us      20.352us         7.17%      20.352us       3.392us             6  
-                                Activity Buffer Request        54.34%       1.471ms        54.34%       1.471ms       1.471ms      18.080us         6.37%      18.080us      18.080us             1  
-                                    aten::empty_strided         1.13%      30.492us         1.13%      30.492us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.33%     225.535us         8.33%     225.535us      37.589us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      71.143us         3.33%      90.164us       3.757us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      19.021us         0.70%      19.021us       0.793us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.22%     222.598us         8.22%     222.598us       4.637us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       4.920us         0.18%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.595us       313.84%     888.595us     888.595us             1  
+                                            torch_eager        18.64%     271.692us        99.64%       1.452ms       1.452ms       0.000us         0.00%     301.536us     301.536us             1  
+                                              aten::mul         9.98%     145.418us        17.29%     252.060us      10.503us     132.896us        46.94%     132.896us       5.537us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.896us        46.94%     132.896us       5.537us            24  
+                                            aten::copy_         6.89%     100.362us        46.38%     676.084us      37.560us     109.376us        38.63%     127.776us       7.099us            18  
+                                            aten::clone         1.48%      21.511us        41.22%     600.853us     100.142us       0.000us         0.00%      70.560us      11.760us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.216us        20.21%      57.216us       4.768us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.160us        18.42%      52.160us       8.693us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.864us        14.43%      40.864us       3.405us            12  
+                                              aten::sub         2.41%      35.143us         4.02%      58.572us       9.762us      20.512us         7.24%      20.512us       3.419us             6  
+                                              aten::add         2.12%      30.932us         3.62%      52.783us       8.797us      20.352us         7.19%      20.352us       3.392us             6  
+                                Activity Buffer Request        16.97%     247.406us        16.97%     247.406us     247.406us      18.400us         6.50%      18.400us      18.400us             1  
+                                    aten::empty_strided         2.15%      31.370us         2.15%      31.370us       5.228us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.35%     267.496us        18.35%     267.496us      44.583us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.85%      70.742us         6.06%      88.302us       3.679us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.20%      17.560us         1.20%      17.560us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.59%     212.742us        14.59%     212.742us       4.432us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.280us         0.36%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.707ms
-Self CUDA time total: 283.932us
+Self CPU time total: 1.458ms
+Self CUDA time total: 283.136us
 
 
 
@@ -4329,27 +4337,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     966.098us       169.87%     966.098us     966.098us             1  
-                                            torch_eager        20.40%     290.715us        99.64%       1.420ms       1.420ms       0.000us         0.00%     592.377us     592.377us             1  
-                                            aten::copy_         7.41%     105.615us        41.73%     594.574us      33.032us     275.293us        48.40%     298.941us      16.608us            18  
-                                              aten::mul        10.90%     155.244us        18.92%     269.648us      11.235us     227.071us        39.93%     227.071us       9.461us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     227.071us        39.93%     227.071us       9.461us            24  
-                                            aten::clone         1.44%      20.483us        35.30%     502.923us      83.821us       0.000us         0.00%     207.134us      34.522us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.486us        32.26%     183.486us      30.581us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.807us        16.14%      91.807us       7.651us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.365us        11.67%      66.365us       5.530us            12  
-                                              aten::sub         2.66%      37.929us         4.43%      63.131us      10.522us      33.790us         5.94%      33.790us       5.632us             6  
-                                              aten::add         2.47%      35.251us         4.15%      59.172us       9.862us      32.575us         5.73%      32.575us       5.429us             6  
-                                Activity Buffer Request        13.81%     196.814us        13.81%     196.814us     196.814us      23.648us         4.16%      23.648us      23.648us             1  
-                                    aten::empty_strided         2.02%      28.790us         2.02%      28.790us       4.798us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.63%     222.685us        15.63%     222.685us      37.114us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.20%      74.092us         6.55%      93.282us       3.887us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.35%      19.190us         1.35%      19.190us       0.800us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.35%     232.987us        16.35%     232.987us       4.854us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       5.080us         0.36%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.856us       167.33%     944.856us     944.856us             1  
+                                            torch_eager        19.10%     286.874us        99.66%       1.497ms       1.497ms       0.000us         0.00%     588.218us     588.218us             1  
+                                            aten::copy_         6.48%      97.352us        44.49%     668.224us      37.124us     273.885us        48.50%     297.437us      16.524us            18  
+                                              aten::mul        11.54%     173.280us        19.20%     288.361us      12.015us     224.990us        39.84%     224.990us       9.375us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     224.990us        39.84%     224.990us       9.375us            24  
+                                            aten::clone         1.34%      20.121us        39.51%     593.393us      98.899us       0.000us         0.00%     206.910us      34.485us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.358us        32.47%     183.358us      30.560us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.527us        16.03%      90.527us       7.544us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.791us        11.65%      65.791us       5.483us            12  
+                                              aten::sub         2.45%      36.872us         4.07%      61.073us      10.179us      33.407us         5.92%      33.407us       5.568us             6  
+                                              aten::add         2.13%      32.018us         3.64%      54.631us       9.105us      32.384us         5.74%      32.384us       5.397us             6  
+                                Activity Buffer Request        16.63%     249.816us        16.63%     249.816us     249.816us      23.552us         4.17%      23.552us      23.552us             1  
+                                    aten::empty_strided         2.02%      30.350us         2.02%      30.350us       5.058us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.28%     259.545us        17.28%     259.545us      43.258us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.52%      67.913us         5.81%      87.211us       3.634us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.28%      19.298us         1.28%      19.298us       0.804us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.87%     223.406us        14.87%     223.406us       4.654us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       5.141us         0.34%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.425ms
-Self CUDA time total: 568.729us
+Self CPU time total: 1.502ms
+Self CUDA time total: 564.666us
 
 
 
@@ -4359,27 +4367,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     975.032us      1053.20%     975.032us     975.032us             1  
-                                            torch_eager        19.78%     289.798us        99.66%       1.460ms       1.460ms       0.000us         0.00%      93.698us      93.698us             1  
-                                              aten::mul        11.08%     162.260us        19.21%     281.475us      11.728us      49.665us        53.65%      49.665us       2.069us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.665us        53.65%      49.665us       2.069us            24  
-                                            aten::copy_         7.16%     104.830us        42.02%     615.673us      34.204us      29.441us        31.80%      30.561us       1.698us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.657us        24.47%      22.657us       1.888us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.472us        14.55%      13.472us       1.123us            12  
-                                            aten::clone         1.39%      20.311us        36.25%     531.032us      88.505us       0.000us         0.00%       7.904us       1.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.33%       6.784us       1.131us             6  
-                                              aten::add         2.30%      33.730us         3.98%      58.302us       9.717us       6.752us         7.29%       6.752us       1.125us             6  
-                                              aten::sub         2.57%      37.640us         4.45%      65.262us      10.877us       6.720us         7.26%       6.720us       1.120us             6  
-                                Activity Buffer Request        14.75%     216.135us        14.75%     216.135us     216.135us       1.120us         1.21%       1.120us       1.120us             1  
-                                    aten::empty_strided         2.59%      37.931us         2.59%      37.931us       6.322us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.29%     223.986us        15.29%     223.986us      37.331us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.89%      71.623us         6.23%      91.274us       3.803us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.34%      19.651us         1.34%      19.651us       0.819us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.53%     242.131us        16.53%     242.131us       5.044us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       5.040us         0.34%       5.040us       5.040us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.448us       990.96%     916.448us     916.448us             1  
+                                            torch_eager        10.63%     281.892us        99.80%       2.647ms       2.647ms       0.000us         0.00%      93.601us      93.601us             1  
+                                              aten::mul         5.58%     148.028us         9.67%     256.571us      10.690us      49.634us        53.67%      49.634us       2.068us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.634us        53.67%      49.634us       2.068us            24  
+                                            aten::copy_         3.99%     105.971us        69.88%       1.854ms     102.991us      29.439us        31.83%      30.559us       1.698us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.655us        24.50%      22.655us       1.888us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.408us        14.50%      13.408us       1.117us            12  
+                                            aten::clone         0.82%      21.802us        66.79%       1.772ms     295.325us       0.000us         0.00%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.34%       6.784us       1.131us             6  
+                                              aten::sub         1.36%      36.061us         2.24%      59.441us       9.907us       6.720us         7.27%       6.720us       1.120us             6  
+                                              aten::add         1.25%      33.260us         2.10%      55.590us       9.265us       6.688us         7.23%       6.688us       1.115us             6  
+                                Activity Buffer Request        54.00%       1.433ms        54.00%       1.433ms       1.433ms       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.13%      29.861us         1.13%      29.861us       4.977us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.52%     252.488us         9.52%     252.488us      42.081us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.59%      68.801us         3.33%      88.471us       3.686us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.74%      19.670us         0.74%      19.670us       0.820us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.18%     216.965us         8.18%     216.965us       4.520us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.410us         0.20%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.465ms
-Self CUDA time total: 92.578us
+Self CPU time total: 2.653ms
+Self CUDA time total: 92.481us
 
 
 
@@ -4389,27 +4397,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     963.643us      1001.81%     963.643us     963.643us             1  
-                                            torch_eager        11.60%     311.071us        99.82%       2.676ms       2.676ms       0.000us         0.00%      97.534us      97.534us             1  
-                                              aten::mul         5.66%     151.593us        10.00%     268.127us      11.172us      51.103us        53.13%      51.103us       2.129us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.103us        53.13%      51.103us       2.129us            24  
-                                            aten::copy_         3.93%     105.441us        68.13%       1.826ms     101.459us      30.911us        32.14%      32.255us       1.792us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.007us        23.92%      23.007us       1.917us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        14.74%      14.176us       1.181us            12  
-                                            aten::clone         1.04%      27.830us        65.21%       1.748ms     291.325us       0.000us         0.00%       9.248us       1.541us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.22%       7.904us       1.317us             6  
-                                              aten::sub         1.38%      37.040us         2.30%      61.581us      10.264us       7.103us         7.38%       7.103us       1.184us             6  
-                                              aten::add         1.19%      32.000us         2.05%      54.860us       9.143us       7.073us         7.35%       7.073us       1.179us             6  
-                                Activity Buffer Request        53.57%       1.436ms        53.57%       1.436ms       1.436ms       1.344us         1.40%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.19%      31.921us         1.19%      31.921us       5.320us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.14%     218.236us         8.14%     218.236us      36.373us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.76%      74.059us         3.52%      94.290us       3.929us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.75%      20.231us         0.75%      20.231us       0.843us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.60%     230.408us         8.60%     230.408us       4.800us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       4.700us         0.18%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.809us       923.35%     888.809us     888.809us             1  
+                                            torch_eager        19.05%     273.129us        99.67%       1.429ms       1.429ms       0.000us         0.00%      97.571us      97.571us             1  
+                                              aten::mul        10.09%     144.695us        17.61%     252.506us      10.521us      51.232us        53.22%      51.232us       2.135us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.232us        53.22%      51.232us       2.135us            24  
+                                            aten::copy_         6.72%      96.301us        45.37%     650.385us      36.132us      30.786us        31.98%      32.098us       1.783us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.944us        23.84%      22.944us       1.912us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.241us        14.79%      14.241us       1.187us            12  
+                                            aten::clone         1.39%      19.911us        40.43%     579.513us      96.586us       0.000us         0.00%       9.154us       1.526us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.842us         8.15%       7.842us       1.307us             6  
+                                              aten::add         2.26%      32.360us         3.79%      54.320us       9.053us       7.136us         7.41%       7.136us       1.189us             6  
+                                              aten::sub         2.55%      36.551us         4.17%      59.791us       9.965us       7.105us         7.38%       7.105us       1.184us             6  
+                                Activity Buffer Request        16.56%     237.415us        16.56%     237.415us     237.415us       1.312us         1.36%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.18%      31.230us         2.18%      31.230us       5.205us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.96%     257.447us        17.96%     257.447us      42.908us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.71%      67.539us         6.11%      87.581us       3.649us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.40%      20.042us         1.40%      20.042us       0.835us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.80%     212.233us        14.80%     212.233us       4.422us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.33%       4.690us         0.33%       4.690us       4.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.681ms
-Self CUDA time total: 96.190us
+Self CPU time total: 1.434ms
+Self CUDA time total: 96.259us
 
 
 
@@ -4419,27 +4427,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     984.120us       950.08%     984.120us     984.120us             1  
-                                            torch_eager        21.32%     307.609us        99.66%       1.438ms       1.438ms       0.000us         0.00%     104.863us     104.863us             1  
-                                              aten::mul        11.11%     160.241us        19.03%     274.535us      11.439us      55.232us        53.32%      55.232us       2.301us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.232us        53.32%      55.232us       2.301us            24  
-                                            aten::copy_         7.56%     109.063us        40.34%     581.983us      32.332us      32.383us        31.26%      33.663us       1.870us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.639us        23.79%      24.639us       2.053us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.968us        15.42%      15.968us       1.331us            12  
-                                            aten::clone         1.50%      21.672us        34.18%     493.044us      82.174us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::add         2.60%      37.520us         4.33%      62.511us      10.418us       8.031us         7.75%       8.031us       1.339us             6  
-                                              aten::sub         2.72%      39.231us         4.56%      65.841us      10.973us       7.937us         7.66%       7.937us       1.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.48%       7.744us       1.291us             6  
-                                Activity Buffer Request        13.05%     188.244us        13.05%     188.244us     188.244us       1.280us         1.24%       1.280us       1.280us             1  
-                                    aten::empty_strided         2.28%      32.882us         2.28%      32.882us       5.480us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.94%     215.555us        14.94%     215.555us      35.926us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.93%      71.162us         6.28%      90.612us       3.776us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.35%      19.450us         1.35%      19.450us       0.810us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.29%     235.016us        16.29%     235.016us       4.896us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.880us         0.34%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     903.536us       870.95%     903.536us     903.536us             1  
+                                            torch_eager        18.87%     271.956us        99.65%       1.436ms       1.436ms       0.000us         0.00%     105.053us     105.053us             1  
+                                              aten::mul        10.20%     146.935us        17.83%     256.897us      10.704us      55.262us        53.27%      55.262us       2.303us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.262us        53.27%      55.262us       2.303us            24  
+                                            aten::copy_         6.83%      98.437us        45.05%     649.198us      36.067us      32.478us        31.31%      33.790us       1.877us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.670us        23.78%      24.670us       2.056us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.001us        15.42%      16.001us       1.333us            12  
+                                            aten::clone         1.50%      21.580us        40.06%     577.333us      96.222us       0.000us         0.00%       9.120us       1.520us             6  
+                                              aten::sub         2.49%      35.841us         4.72%      67.992us      11.332us       8.001us         7.71%       8.001us       1.333us             6  
+                                              aten::add         2.31%      33.350us         3.86%      55.670us       9.278us       8.000us         7.71%       8.000us       1.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.53%       7.808us       1.301us             6  
+                                Activity Buffer Request        16.46%     237.265us        16.46%     237.265us     237.265us       1.312us         1.26%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.16%      31.090us         2.16%      31.090us       5.182us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.50%     252.196us        17.50%     252.196us      42.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.40%      63.461us         5.67%      81.650us       3.402us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      18.189us         1.26%      18.189us       0.758us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.66%     225.733us        15.66%     225.733us       4.703us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.060us         0.35%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.443ms
-Self CUDA time total: 103.583us
+Self CPU time total: 1.441ms
+Self CUDA time total: 103.741us
 
 
 
@@ -4449,27 +4457,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     935.122us       757.84%     935.122us     935.122us             1  
-                                            torch_eager        19.99%     283.519us        99.60%       1.412ms       1.412ms       0.000us         0.00%     125.153us     125.153us             1  
-                                              aten::mul        10.97%     155.634us        18.77%     266.135us      11.089us      65.024us        52.70%      65.024us       2.709us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.024us        52.70%      65.024us       2.709us            24  
-                                            aten::copy_         7.53%     106.809us        43.10%     611.203us      33.956us      39.201us        31.77%      40.961us       2.276us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.769us        23.31%      28.769us       2.397us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us        15.53%      19.168us       1.597us            12  
-                                            aten::clone         1.50%      21.262us        37.00%     524.722us      87.454us       0.000us         0.00%      12.192us       2.032us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.45%      10.432us       1.739us             6  
-                                              aten::add         2.41%      34.151us         3.94%      55.922us       9.320us       9.664us         7.83%       9.664us       1.611us             6  
-                                              aten::sub         2.49%      35.371us         4.21%      59.711us       9.952us       9.504us         7.70%       9.504us       1.584us             6  
-                                Activity Buffer Request        14.55%     206.375us        14.55%     206.375us     206.375us       1.760us         1.43%       1.760us       1.760us             1  
-                                    aten::empty_strided         2.12%      30.049us         2.12%      30.049us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.20%     229.735us        16.20%     229.735us      38.289us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.63%      65.693us         5.97%      84.623us       3.526us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.33%      18.930us         1.33%      18.930us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.86%     224.896us        15.86%     224.896us       4.685us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.729us         0.40%       5.729us       5.729us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     903.548us       729.80%     903.548us     903.548us             1  
+                                            torch_eager        10.56%     280.674us        99.81%       2.652ms       2.652ms       0.000us         0.00%     125.567us     125.567us             1  
+                                              aten::mul         5.49%     145.805us         9.46%     251.467us      10.478us      65.184us        52.65%      65.184us       2.716us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.184us        52.65%      65.184us       2.716us            24  
+                                            aten::copy_         3.75%      99.563us        70.08%       1.862ms     103.468us      39.422us        31.84%      41.182us       2.288us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.928us        23.37%      28.928us       2.411us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.201us        15.51%      19.201us       1.600us            12  
+                                            aten::clone         0.92%      24.379us        67.48%       1.793ms     298.872us       0.000us         0.00%      12.254us       2.042us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.494us         8.48%      10.494us       1.749us             6  
+                                              aten::add         1.15%      30.622us         1.96%      52.162us       8.694us       9.633us         7.78%       9.633us       1.606us             6  
+                                              aten::sub         1.45%      38.422us         2.36%      62.661us      10.443us       9.568us         7.73%       9.568us       1.595us             6  
+                                Activity Buffer Request        54.94%       1.460ms        54.94%       1.460ms       1.460ms       1.760us         1.42%       1.760us       1.760us             1  
+                                    aten::empty_strided         1.16%      30.801us         1.16%      30.801us       5.133us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.14%     242.866us         9.14%     242.866us      40.478us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.56%      67.990us         3.30%      87.783us       3.658us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.74%      19.793us         0.74%      19.793us       0.825us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.96%     211.432us         7.96%     211.432us       4.405us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.160us         0.19%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.418ms
-Self CUDA time total: 123.393us
+Self CPU time total: 2.658ms
+Self CUDA time total: 123.807us
 
 
 
@@ -4479,27 +4487,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.101us       931.86%     964.101us     964.101us             1  
-                                            torch_eager        11.58%     311.269us        99.80%       2.682ms       2.682ms       0.000us         0.00%     104.772us     104.772us             1  
-                                              aten::mul         5.74%     154.165us         9.94%     267.067us      11.128us      55.236us        53.39%      55.236us       2.301us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.236us        53.39%      55.236us       2.301us            24  
-                                            aten::copy_         4.07%     109.351us        68.30%       1.836ms     101.989us      32.287us        31.21%      33.599us       1.867us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.511us        23.69%      24.511us       2.043us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.937us        15.40%      15.937us       1.328us            12  
-                                            aten::clone         1.02%      27.532us        65.06%       1.749ms     291.482us       0.000us         0.00%       9.088us       1.515us             6  
-                                              aten::add         1.31%      35.310us         2.20%      59.141us       9.857us       7.969us         7.70%       7.969us       1.328us             6  
-                                              aten::sub         1.38%      37.131us         2.33%      62.602us      10.434us       7.968us         7.70%       7.968us       1.328us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         7.52%       7.776us       1.296us             6  
-                                Activity Buffer Request        53.54%       1.439ms        53.54%       1.439ms       1.439ms       1.312us         1.27%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.12%      30.190us         1.12%      30.190us       5.032us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.09%     217.335us         8.09%     217.335us      36.223us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.62%      70.291us         3.31%      88.901us       3.704us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.69%      18.610us         0.69%      18.610us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.64%     232.137us         8.64%     232.137us       4.836us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.481us         0.20%       5.481us       5.481us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     889.436us       855.74%     889.436us     889.436us             1  
+                                            torch_eager        19.42%     274.045us        99.59%       1.406ms       1.406ms       0.000us         0.00%     105.282us     105.282us             1  
+                                              aten::mul        10.41%     146.921us        18.18%     256.563us      10.690us      55.486us        53.38%      55.486us       2.312us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.486us        53.38%      55.486us       2.312us            24  
+                                            aten::copy_         6.82%      96.302us        44.56%     628.895us      34.939us      32.513us        31.28%      33.857us       1.881us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.705us        23.77%      24.705us       2.059us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.939us        15.34%      15.939us       1.328us            12  
+                                            aten::clone         1.41%      19.928us        39.46%     556.871us      92.812us       0.000us         0.00%       9.152us       1.525us             6  
+                                              aten::sub         2.56%      36.082us         4.16%      58.744us       9.791us       7.970us         7.67%       7.970us       1.328us             6  
+                                              aten::add         2.23%      31.511us         3.85%      54.282us       9.047us       7.969us         7.67%       7.969us       1.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.51%       7.808us       1.301us             6  
+                                Activity Buffer Request        15.99%     225.676us        15.99%     225.676us     225.676us       1.344us         1.29%       1.344us       1.344us             1  
+                                    aten::empty_strided         2.17%      30.631us         2.17%      30.631us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.52%     247.335us        17.52%     247.335us      41.223us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.52%      63.850us         5.84%      82.475us       3.436us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.32%      18.625us         1.32%      18.625us       0.776us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.21%     214.657us        15.21%     214.657us       4.472us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.41%       5.810us         0.41%       5.810us       5.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.688ms
-Self CUDA time total: 103.460us
+Self CPU time total: 1.411ms
+Self CUDA time total: 103.938us
 
 
 
@@ -4509,27 +4517,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.072us       780.68%     964.072us     964.072us             1  
-                                            torch_eager        11.45%     316.268us        99.81%       2.758ms       2.758ms       0.000us         0.00%     125.283us     125.283us             1  
-                                              aten::mul         5.46%     150.776us         9.46%     261.336us      10.889us      65.090us        52.71%      65.090us       2.712us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.090us        52.71%      65.090us       2.712us            24  
-                                            aten::copy_         3.85%     106.511us        68.83%       1.902ms     105.647us      39.266us        31.80%      41.058us       2.281us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.802us        23.32%      28.802us       2.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.135us        15.50%      19.135us       1.595us            12  
-                                            aten::clone         1.09%      30.231us        66.11%       1.827ms     304.441us       0.000us         0.00%      12.256us       2.043us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us         8.47%      10.464us       1.744us             6  
-                                              aten::add         1.22%      33.650us         2.08%      57.431us       9.572us       9.599us         7.77%       9.599us       1.600us             6  
-                                              aten::sub         1.35%      37.292us         2.48%      68.652us      11.442us       9.536us         7.72%       9.536us       1.589us             6  
-                                Activity Buffer Request        54.53%       1.507ms        54.53%       1.507ms       1.507ms       1.792us         1.45%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.19%      32.821us         1.19%      32.821us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.01%     221.424us         8.01%     221.424us      36.904us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.55%      70.592us         3.23%      89.363us       3.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.68%      18.771us         0.68%      18.771us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.42%     232.664us         8.42%     232.664us       4.847us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.190us         0.19%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.135us       717.15%     888.135us     888.135us             1  
+                                            torch_eager        18.91%     268.465us        99.65%       1.415ms       1.415ms       0.000us         0.00%     125.666us     125.666us             1  
+                                              aten::mul        10.15%     144.114us        17.70%     251.265us      10.469us      65.346us        52.77%      65.346us       2.723us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.346us        52.77%      65.346us       2.723us            24  
+                                            aten::copy_         6.90%      97.992us        45.41%     644.725us      35.818us      39.328us        31.76%      41.152us       2.286us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.26%      28.800us       2.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us        15.48%      19.168us       1.597us            12  
+                                            aten::clone         1.46%      20.690us        40.33%     572.532us      95.422us       0.000us         0.00%      12.352us       2.059us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us         8.50%      10.528us       1.755us             6  
+                                              aten::add         2.19%      31.029us         3.69%      52.390us       8.732us       9.600us         7.75%       9.600us       1.600us             6  
+                                              aten::sub         2.50%      35.469us         4.13%      58.580us       9.763us       9.568us         7.73%       9.568us       1.595us             6  
+                                Activity Buffer Request        15.69%     222.765us        15.69%     222.765us     222.765us       1.824us         1.47%       1.824us       1.824us             1  
+                                    aten::empty_strided         2.29%      32.500us         2.29%      32.500us       5.417us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.50%     262.716us        18.50%     262.716us      43.786us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.70%      66.710us         6.07%      86.108us       3.588us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.37%      19.398us         1.37%      19.398us       0.808us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.99%     212.875us        14.99%     212.875us       4.435us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.010us         0.35%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.763ms
-Self CUDA time total: 123.491us
+Self CPU time total: 1.420ms
+Self CUDA time total: 123.842us
 
 
 
@@ -4539,27 +4547,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     934.855us       527.63%     934.855us     934.855us             1  
-                                            torch_eager        19.51%     283.728us        99.66%       1.450ms       1.450ms       0.000us         0.00%     180.061us     180.061us             1  
-                                              aten::mul        10.43%     151.748us        18.10%     263.338us      10.972us      95.007us        53.62%      95.007us       3.959us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.007us        53.62%      95.007us       3.959us            24  
-                                            aten::copy_         7.11%     103.461us        44.35%     645.065us      35.837us      57.664us        32.55%      60.544us       3.364us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.608us        22.92%      40.608us       3.384us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.510us        13.83%      24.510us       2.042us            12  
-                                            aten::clone         1.46%      21.280us        38.39%     558.424us      93.071us       0.000us         0.00%      19.936us       3.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us         9.63%      17.056us       2.843us             6  
-                                              aten::add         2.36%      34.271us         3.99%      58.001us       9.667us      12.287us         6.93%      12.287us       2.048us             6  
-                                              aten::sub         2.55%      37.161us         4.24%      61.641us      10.274us      12.223us         6.90%      12.223us       2.037us             6  
-                                Activity Buffer Request        17.53%     255.006us        17.53%     255.006us     255.006us       2.880us         1.63%       2.880us       2.880us             1  
-                                    aten::empty_strided         2.02%      29.311us         2.02%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.21%     221.267us        15.21%     221.267us      36.878us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.73%      68.750us         6.01%      87.372us       3.641us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.28%      18.622us         1.28%      18.622us       0.776us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.48%     225.131us        15.48%     225.131us       4.690us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.880us         0.34%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     910.045us       513.35%     910.045us     910.045us             1  
+                                            torch_eager         9.66%     280.213us        99.83%       2.894ms       2.894ms       0.000us         0.00%     180.188us     180.188us             1  
+                                              aten::mul         5.18%     150.102us         9.00%     260.863us      10.869us      94.655us        53.39%      94.655us       3.944us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.655us        53.39%      94.655us       3.944us            24  
+                                            aten::copy_         3.40%      98.673us        72.45%       2.101ms     116.706us      57.885us        32.65%      60.797us       3.378us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.799us        23.01%      40.799us       3.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        13.95%      24.736us       2.061us            12  
+                                            aten::clone         0.79%      22.860us        70.00%       2.030ms     338.262us       0.000us         0.00%      19.998us       3.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.086us         9.64%      17.086us       2.848us             6  
+                                              aten::add         1.13%      32.880us         1.89%      54.761us       9.127us      12.416us         7.00%      12.416us       2.069us             6  
+                                              aten::sub         1.18%      34.239us         1.98%      57.551us       9.592us      12.320us         6.95%      12.320us       2.053us             6  
+                                Activity Buffer Request        58.76%       1.704ms        58.76%       1.704ms       1.704ms       2.912us         1.64%       2.912us       2.912us             1  
+                                    aten::empty_strided         1.11%      32.150us         1.11%      32.150us       5.358us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.21%     238.144us         8.21%     238.144us      39.691us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.29%      66.481us         2.94%      85.213us       3.551us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.65%      18.732us         0.65%      18.732us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.46%     216.224us         7.46%     216.224us       4.505us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.17%       5.070us         0.17%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.455ms
-Self CUDA time total: 177.181us
+Self CPU time total: 2.899ms
+Self CUDA time total: 177.276us
 
 
 
@@ -4569,27 +4577,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     936.902us       314.34%     936.902us     936.902us             1  
-                                            torch_eager        19.95%     279.505us        99.63%       1.396ms       1.396ms       0.000us         0.00%     315.267us     315.267us             1  
-                                              aten::mul        10.85%     152.079us        18.94%     265.395us      11.058us     146.176us        49.04%     146.176us       6.091us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.176us        49.04%     146.176us       6.091us            24  
-                                            aten::copy_         7.66%     107.385us        42.60%     596.937us      33.163us     110.978us        37.23%     128.194us       7.122us            18  
-                                            aten::clone         1.45%      20.319us        36.31%     508.783us      84.797us       0.000us         0.00%      70.625us      11.771us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.569us        19.32%      57.569us       4.797us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.409us        17.92%      53.409us       8.902us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.897us        13.72%      40.897us       3.408us            12  
-                                              aten::sub         2.61%      36.531us         4.38%      61.402us      10.234us      20.449us         6.86%      20.449us       3.408us             6  
-                                              aten::add         2.39%      33.533us         3.98%      55.753us       9.292us      20.448us         6.86%      20.448us       3.408us             6  
-                                Activity Buffer Request        14.75%     206.705us        14.75%     206.705us     206.705us      17.216us         5.78%      17.216us      17.216us             1  
-                                    aten::empty_strided         2.13%      29.842us         2.13%      29.842us       4.974us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.44%     216.385us        15.44%     216.385us      36.064us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.91%      68.874us         6.21%      87.042us       3.627us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.30%      18.168us         1.30%      18.168us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.19%     226.869us        16.19%     226.869us       4.726us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.161us         0.37%       5.161us       5.161us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     919.612us       310.44%     919.612us     919.612us             1  
+                                            torch_eager        10.49%     286.464us        99.82%       2.726ms       2.726ms       0.000us         0.00%     313.057us     313.057us             1  
+                                              aten::mul         5.34%     145.716us         9.29%     253.789us      10.575us     145.182us        49.01%     145.182us       6.049us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.182us        49.01%     145.182us       6.049us            24  
+                                            aten::copy_         3.69%     100.696us        70.60%       1.928ms     107.115us     109.985us        37.13%     126.817us       7.045us            18  
+                                            aten::clone         0.88%      23.951us        68.02%       1.858ms     309.597us       0.000us         0.00%      69.474us      11.579us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.343us        19.36%      57.343us       4.779us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.642us        17.77%      52.642us       8.774us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.058us        13.86%      41.058us       3.421us            12  
+                                              aten::sub         1.33%      36.191us         2.18%      59.621us       9.937us      20.609us         6.96%      20.609us       3.435us             6  
+                                              aten::add         1.14%      31.230us         1.95%      53.190us       8.865us      20.449us         6.90%      20.449us       3.408us             6  
+                                Activity Buffer Request        56.07%       1.531ms        56.07%       1.531ms       1.531ms      16.832us         5.68%      16.832us      16.832us             1  
+                                    aten::empty_strided         1.17%      32.070us         1.17%      32.070us       5.345us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.59%     234.696us         8.59%     234.696us      39.116us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.53%      69.062us         3.26%      88.922us       3.705us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.73%      19.860us         0.73%      19.860us       0.827us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.86%     214.752us         7.86%     214.752us       4.474us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       4.930us         0.18%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.401ms
-Self CUDA time total: 298.051us
+Self CPU time total: 2.731ms
+Self CUDA time total: 296.225us
 
 
 
@@ -4599,27 +4607,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     953.069us       538.57%     953.069us     953.069us             1  
-                                            torch_eager        19.36%     280.983us        99.62%       1.446ms       1.446ms       0.000us         0.00%     179.812us     179.812us             1  
-                                              aten::mul        10.74%     155.876us        18.65%     270.688us      11.279us      94.916us        53.64%      94.916us       3.955us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.916us        53.64%      94.916us       3.955us            24  
-                                            aten::copy_         7.70%     111.823us        43.62%     633.117us      35.173us      57.568us        32.53%      60.416us       3.356us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.544us        22.91%      40.544us       3.379us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.480us        13.83%      24.480us       2.040us            12  
-                                            aten::clone         1.50%      21.731us        37.58%     545.384us      90.897us       0.000us         0.00%      19.872us       3.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.62%      17.024us       2.837us             6  
-                                              aten::add         2.38%      34.509us         4.05%      58.781us       9.797us      12.256us         6.93%      12.256us       2.043us             6  
-                                              aten::sub         2.51%      36.442us         4.13%      59.923us       9.987us      12.224us         6.91%      12.224us       2.037us             6  
-                                Activity Buffer Request        15.40%     223.485us        15.40%     223.485us     223.485us       2.848us         1.61%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.13%      30.930us         2.13%      30.930us       5.155us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.79%     229.197us        15.79%     229.197us      38.200us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.88%      70.882us         6.18%      89.652us       3.735us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.29%      18.770us         1.29%      18.770us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.93%     231.177us        15.93%     231.177us       4.816us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.510us         0.38%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     889.394us       501.79%     889.394us     889.394us             1  
+                                            torch_eager        17.97%     266.975us        99.65%       1.481ms       1.481ms       0.000us         0.00%     180.092us     180.092us             1  
+                                              aten::mul         9.80%     145.611us        16.96%     251.937us      10.497us      94.974us        53.58%      94.974us       3.957us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.974us        53.58%      94.974us       3.957us            24  
+                                            aten::copy_         6.75%     100.282us        47.98%     712.837us      39.602us      57.694us        32.55%      60.542us       3.363us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.736us        22.98%      40.736us       3.395us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.576us        13.87%      24.576us       2.048us            12  
+                                            aten::clone         1.38%      20.549us        43.06%     639.725us     106.621us       0.000us         0.00%      19.806us       3.301us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.958us         9.57%      16.958us       2.826us             6  
+                                              aten::sub         2.49%      37.040us         4.14%      61.531us      10.255us      12.289us         6.93%      12.289us       2.048us             6  
+                                              aten::add         2.11%      31.282us         3.59%      53.402us       8.900us      12.287us         6.93%      12.287us       2.048us             6  
+                                Activity Buffer Request        19.87%     295.257us        19.87%     295.257us     295.257us       2.848us         1.61%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.04%      30.372us         2.04%      30.372us       5.062us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.34%     257.637us        17.34%     257.637us      42.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.31%      64.000us         5.58%      82.951us       3.456us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.28%      18.951us         1.28%      18.951us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.31%     212.598us        14.31%     212.598us       4.429us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.130us         0.35%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.451ms
-Self CUDA time total: 176.964us
+Self CPU time total: 1.486ms
+Self CUDA time total: 177.244us
 
 
 
@@ -4629,27 +4637,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     992.756us       332.77%     992.756us     992.756us             1  
-                                            torch_eager        20.12%     289.006us        99.66%       1.432ms       1.432ms       0.000us         0.00%     316.222us     316.222us             1  
-                                              aten::mul        11.31%     162.528us        19.47%     279.759us      11.657us     146.880us        49.23%     146.880us       6.120us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.880us        49.23%     146.880us       6.120us            24  
-                                            aten::copy_         7.73%     111.012us        41.48%     595.895us      33.105us     110.942us        37.19%     128.830us       7.157us            18  
-                                            aten::clone         1.55%      22.310us        35.21%     505.793us      84.299us       0.000us         0.00%      71.424us      11.904us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.406us        19.24%      57.406us       4.784us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.536us        17.94%      53.536us       8.923us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.512us        13.58%      40.512us       3.376us            12  
-                                              aten::add         2.53%      36.289us         4.25%      61.011us      10.169us      20.352us         6.82%      20.352us       3.392us             6  
-                                              aten::sub         2.59%      37.162us         4.41%      63.291us      10.549us      20.160us         6.76%      20.160us       3.360us             6  
-                                Activity Buffer Request        13.10%     188.164us        13.10%     188.164us     188.164us      17.888us         6.00%      17.888us      17.888us             1  
-                                    aten::empty_strided         2.24%      32.121us         2.24%      32.121us       5.354us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.74%     226.067us        15.74%     226.067us      37.678us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.81%      69.111us         6.15%      88.363us       3.682us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.34%      19.252us         1.34%      19.252us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.62%     238.734us        16.62%     238.734us       4.974us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.940us         0.34%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.016us       306.24%     909.016us     909.016us             1  
+                                            torch_eager        19.05%     269.264us        99.66%       1.409ms       1.409ms       0.000us         0.00%     314.684us     314.684us             1  
+                                              aten::mul        10.56%     149.323us        19.02%     268.875us      11.203us     145.440us        49.00%     145.440us       6.060us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.440us        49.00%     145.440us       6.060us            24  
+                                            aten::copy_         6.96%      98.305us        44.09%     623.125us      34.618us     110.751us        37.31%     128.606us       7.145us            18  
+                                            aten::clone         1.45%      20.520us        38.80%     548.422us      91.404us       0.000us         0.00%      71.453us      11.909us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.153us        19.25%      57.153us       4.763us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.598us        18.06%      53.598us       8.933us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.638us        13.69%      40.638us       3.387us            12  
+                                              aten::add         2.27%      32.070us         3.85%      54.390us       9.065us      20.352us         6.86%      20.352us       3.392us             6  
+                                              aten::sub         2.35%      33.277us         4.05%      57.282us       9.547us      20.286us         6.83%      20.286us       3.381us             6  
+                                Activity Buffer Request        15.96%     225.655us        15.96%     225.655us     225.655us      17.855us         6.02%      17.855us      17.855us             1  
+                                    aten::empty_strided         2.15%      30.350us         2.15%      30.350us       5.058us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.79%     237.294us        16.79%     237.294us      39.549us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.69%      66.249us         6.00%      84.797us       3.533us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.31%      18.548us         1.31%      18.548us       0.773us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.11%     227.748us        16.11%     227.748us       4.745us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.840us         0.34%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.437ms
-Self CUDA time total: 298.334us
+Self CPU time total: 1.413ms
+Self CUDA time total: 296.829us
 
 
 
@@ -4659,27 +4667,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     957.657us       163.29%     957.657us     957.657us             1  
-                                            torch_eager        20.09%     288.813us        99.63%       1.432ms       1.432ms       0.000us         0.00%     610.425us     610.425us             1  
-                                            aten::copy_         7.31%     105.011us        42.63%     612.724us      34.040us     268.572us        45.79%     292.508us      16.250us            18  
-                                              aten::mul        10.71%     153.870us        18.84%     270.776us      11.282us     252.607us        43.07%     252.607us      10.525us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     252.607us        43.07%     252.607us      10.525us            24  
-                                            aten::clone         1.42%      20.480us        36.58%     525.692us      87.615us       0.000us         0.00%     201.566us      33.594us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.630us        30.29%     177.630us      29.605us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.942us        15.51%      90.942us       7.578us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.310us        11.14%      65.310us       5.443us            12  
-                                              aten::sub         2.69%      38.720us         4.45%      63.991us      10.665us      32.991us         5.63%      32.991us       5.499us             6  
-                                              aten::add         2.37%      34.041us         3.93%      56.461us       9.410us      32.319us         5.51%      32.319us       5.387us             6  
-                                Activity Buffer Request        15.99%     229.866us        15.99%     229.866us     229.866us      23.936us         4.08%      23.936us      23.936us             1  
-                                    aten::empty_strided         2.02%      29.010us         2.02%      29.010us       4.835us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.72%     211.585us        14.72%     211.585us      35.264us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.83%      69.478us         6.24%      89.671us       3.736us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.40%      20.193us         1.40%      20.193us       0.841us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.06%     230.859us        16.06%     230.859us       4.810us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.320us         0.37%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.757us       157.09%     916.757us     916.757us             1  
+                                            torch_eager        19.46%     274.242us        99.65%       1.404ms       1.404ms       0.000us         0.00%     607.350us     607.350us             1  
+                                            aten::copy_         7.01%      98.793us        43.42%     611.905us      33.995us     268.603us        46.03%     292.379us      16.243us            18  
+                                              aten::mul        10.57%     148.926us        18.84%     265.480us      11.062us     249.086us        42.68%     249.086us      10.379us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     249.086us        42.68%     249.086us      10.379us            24  
+                                            aten::clone         1.44%      20.340us        38.12%     537.253us      89.542us       0.000us         0.00%     202.173us      33.696us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     178.397us        30.57%     178.397us      29.733us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.206us        15.46%      90.206us       7.517us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.885us        11.29%      65.885us       5.490us            12  
+                                              aten::sub         2.63%      37.022us         4.37%      61.602us      10.267us      33.151us         5.68%      33.151us       5.525us             6  
+                                              aten::add         2.33%      32.810us         3.92%      55.180us       9.197us      32.734us         5.61%      32.734us       5.456us             6  
+                                Activity Buffer Request        15.58%     219.605us        15.58%     219.605us     219.605us      23.776us         4.07%      23.776us      23.776us             1  
+                                    aten::empty_strided         2.10%      29.631us         2.10%      29.631us       4.938us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.49%     232.396us        16.49%     232.396us      38.733us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.73%      66.612us         6.10%      85.953us       3.581us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.37%      19.341us         1.37%      19.341us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.94%     224.615us        15.94%     224.615us       4.679us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       4.910us         0.35%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.437ms
-Self CUDA time total: 586.489us
+Self CPU time total: 1.409ms
+Self CUDA time total: 583.574us
 
 
 
@@ -4689,55 +4697,61 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         9.43%     329.378us        77.87%       2.720ms       2.720ms       0.000us         0.00%       1.842ms       1.842ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.815ms       102.19%       1.815ms       1.815ms             1  
-                                            aten::copy_         3.09%     107.951us        52.68%       1.840ms     102.235us     794.051us        44.71%     860.068us      47.782us            18  
-                                              aten::mul         4.59%     160.365us         8.02%     279.997us      11.667us     834.368us        46.99%     834.368us      34.765us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     834.368us        46.99%     834.368us      34.765us            24  
-                                            aten::clone         0.80%      28.034us        50.14%       1.751ms     291.882us       0.000us         0.00%     627.394us     104.566us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     561.377us        31.61%     561.377us      93.563us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.674us        13.10%     232.674us      19.389us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.392us         8.30%     147.392us      12.283us            12  
-                                              aten::sub         1.14%      39.970us         1.89%      66.170us      11.028us      89.952us         5.07%      89.952us      14.992us             6  
-                                Activity Buffer Request        41.31%       1.443ms        41.31%       1.443ms       1.443ms      66.017us         3.72%      66.017us      66.017us             1  
-                                              aten::add         0.95%      33.281us         1.61%      56.271us       9.379us      57.440us         3.23%      57.440us       9.573us             6  
-                                    aten::empty_strided         0.85%      29.670us         0.85%      29.670us       4.945us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.22%     217.146us         6.22%     217.146us      36.191us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.01%      70.292us         2.58%      90.182us       3.758us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.57%      19.890us         0.57%      19.890us       0.829us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         6.90%     240.975us         6.90%     240.975us       5.020us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        22.13%     773.090us        22.13%     773.090us     773.090us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager        12.10%     272.127us        61.47%       1.382ms       1.382ms       0.000us         0.00%       1.837ms       1.837ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.810ms       102.21%       1.810ms       1.810ms             1  
+                                            aten::copy_         4.74%     106.692us        27.02%     607.756us      33.764us     794.110us        44.84%     859.966us      47.776us            18  
+                                              aten::mul         6.35%     142.895us        11.18%     251.386us      10.474us     829.085us        46.82%     829.085us      34.545us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     829.085us        46.82%     829.085us      34.545us            24  
+                                            aten::clone         0.94%      21.099us        23.42%     526.743us      87.790us       0.000us         0.00%     627.678us     104.613us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     561.822us        31.73%     561.822us      93.637us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.288us        13.12%     232.288us      19.357us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.650us         8.34%     147.650us      12.304us            12  
+                                              aten::sub         1.58%      35.541us         2.61%      58.661us       9.777us      89.538us         5.06%      89.538us      14.923us             6  
+                                Activity Buffer Request         9.29%     208.845us         9.29%     208.845us     208.845us      65.856us         3.72%      65.856us      65.856us             1  
+                                              aten::add         1.43%      32.251us         2.42%      54.461us       9.077us      58.112us         3.28%      58.112us       9.685us             6  
+                                    aten::empty_strided         1.39%      31.342us         1.39%      31.342us       5.224us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.27%     230.957us        10.27%     230.957us      38.493us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.99%      67.270us         3.80%      85.550us       3.565us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.81%      18.280us         0.81%      18.280us       0.762us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.56%     215.083us         9.56%     215.083us       4.481us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        38.53%     866.589us        38.53%     866.589us     866.589us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.493ms
-Self CUDA time total: 1.776ms
+Self CPU time total: 2.249ms
+Self CUDA time total: 1.771ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.21  True
 torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
-torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
-torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.21  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.21  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
 torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.21  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.21  True
 torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
 torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
 torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.21  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.21  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.21  True
 torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
 
+
+
▶ UV Install Logs
+ +

Artifacts:

rotary.jsonl diff --git a/rotary/index.html b/rotary/index.html index 5ff503336b04c290f15ed24958b96a45568efad3..7b41f23988a4a73d3b10140441b1b9a5b33195e3 100644 --- a/rotary/index.html +++ b/rotary/index.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg index 3fdefb46544d73b9bc85fc2ae3e00add87b86535..826939d07c71a260b94a20f3de28c5f0a2cf3fac 100644 --- a/rotary/results/artifacts/combine/latency.svg +++ b/rotary/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36e71e631ab1a00097df3bc72a4532b4b383ed31a1df2368bd041e765254a9c3 -size 31018 +oid sha256:1073fb7d2fda354b1fcc8c64879bfd105a9e36e75f5ab25c8a5cb53277099549 +size 37852 diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html index 17475d0e65452d0f310ef38d60c5c80c88e6833b..ef6ae6cd3e3ada34b3c95414a045f0c92f1e3ce7 100644 --- a/rotary/results/combined_results.html +++ b/rotary/results/combined_results.html @@ -809,6 +809,14 @@ .artifact-preview svg { background: transparent; } + /* Invert SVG images in dark mode */ + :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { + filter: invert(0.9) hue-rotate(180deg); + } + /* Keep SVG images readable in monocolor mode */ + :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { + filter: none; + } /* CSV table styling */ .artifact-csv { margin-top: 1rem; @@ -3872,7 +3880,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - 2025-10-29T14:27:54.393501 + 2025-10-29T15:51:00.751980 image/svg+xml @@ -4216,108 +4224,179 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + - + - 0.2 + 0.1 - + - + - 0.3 + 0.2 - + - + - 0.4 + 0.3 - + - + - 0.5 + 0.4 - + - + - 0.6 + 0.5 + + + + + + + + + + + + + 0.6 + + + + + + + + + + + + + 0.7 + + + + + + + + + + + + + 0.8 Latency P50 (ms) - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -4330,21 +4409,30 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: - + Attention Implementation Latency - + - + + + hf_kernels_rotary + + + + + + + - torch_eager + torch_eager @@ -4364,7 +4452,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: combine | 4.35s +Cell: combine | 4.43s | Raw @@ -4436,11 +4524,11 @@ Cell: combine | 4.35s
======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ HF Kernels Rotary             : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/49ec9501b131c967277abe3cccb638422565260339bb30f5ea386b0076f2183e
+✓ HF Kernels Rotary             : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/3884170bda871392d403d55c822a8b7de8970f81c4733ae7630938c3bf0db88a
 ✓ PyTorch Rotary                : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5
 
   ✓ Found HF Kernels Rotary
-     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/49ec9501b131c967277abe3cccb638422565260339bb30f5ea386b0076f2183e/rotary.jsonl
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/3884170bda871392d403d55c822a8b7de8970f81c4733ae7630938c3bf0db88a/rotary.jsonl
   ✓ Found PyTorch Rotary
      Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5/rotary.jsonl
 
@@ -4451,54 +4539,54 @@ Summary: 2 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.10  False
-hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  False
-hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B1_S512_H8_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B2_S128_H32_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.28  False
-hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.10  False
-hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  False
-torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
+hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.07  True
+hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.26  True
+hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B1_S512_H8_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B2_S128_H32_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.84  True
+hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.26  True
+hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.09  True
+hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.21  True
 torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
-torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
-torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.21  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.21  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
 torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.21  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.21  True
 torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
 torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
 torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.21  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.21  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.21  True
 torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4518,7 +4606,7 @@ Implementations included:
 
▶ UV Install Logs
@@ -4531,7 +4619,7 @@ Installed 37 packages in 239ms - 2025-10-29T14:27:54.393501 + 2025-10-29T15:51:00.751980 image/svg+xml @@ -4875,108 +4963,179 @@ Installed 37 packages in 239ms - + - + - 0.2 + 0.1 - + - + - 0.3 + 0.2 - + - + - 0.4 + 0.3 - + - + - 0.5 + 0.4 - + - + - 0.6 + 0.5 + + + + + + + + + + + + + 0.6 + + + + + + + + + + + + + 0.7 + + + + + + + + + + + + + 0.8 Latency P50 (ms) - - + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -4989,21 +5148,30 @@ Installed 37 packages in 239ms - + Attention Implementation Latency - + - + + + hf_kernels_rotary + + + + + + + - torch_eager + torch_eager