diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl index 7d1a3c651f4f5824ec6a49a44a98836bdbadbb74..e014dc32b9e6c116e3ee3407f5e64ef8451eee67 100644 --- a/activation/impls/artifacts/benchmark/activation.jsonl +++ b/activation/impls/artifacts/benchmark/activation.jsonl @@ -1,9 +1,9 @@ -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.022969999974975508, "p50": 0.023499999997511622, "p90": 0.023961000010785938, "mean": 0.02361460000201987, "iqr": 0.0009899999895424116, "raw_times": [0.022971000021243526, 0.022969999974975508, 0.023961000010785938, 0.023499999997511622, 0.024671000005582755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03073999999969601, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027540000019143918, "p50": 0.029130999962490023, "p90": 0.03002100004323438, "mean": 0.029014800009008468, "iqr": 0.0016900000332498166, "raw_times": [0.027540000019143918, 
0.030051000010189455, 0.03002100004323438, 0.029130999962490023, 0.028331000009984564], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343999998151048, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02829999999676147, "p50": 0.029119999965132592, "p90": 0.03051000004461457, "mean": 0.029939999990347133, "iqr": 0.0019500000689731678, "raw_times": [0.02829999999676147, 0.03051000004461457, 0.033209999969585624, 0.029119999965132592, 0.028559999975641404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031761000002461515, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027531000000635686, "p50": 0.028170999996746104, "p90": 0.028501000031155854, "mean": 0.028293000002577173, "iqr": 0.0008900000239009387, "raw_times": [0.027611000007254916, 0.028170999996746104, 
0.029650999977093306, 0.027531000000635686, 0.028501000031155854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03130000004603062, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02755000002707675, "p50": 0.02861000001530556, "p90": 0.028831000008722185, "mean": 0.02867660001584227, "iqr": 0.00023000001192485797, "raw_times": [0.028600999996797327, 0.029791000031309522, 0.028831000008722185, 0.02755000002707675, 0.02861000001530556], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03139000000373926, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02618100000972845, "p50": 0.027131000024382956, "p90": 0.02731099999664366, "mean": 0.026918799994746223, "iqr": 0.0007610000238855719, "raw_times": [0.02618100000972845, 0.027131000024382956, 0.027420999970217963, 
0.02731099999664366, 0.026549999972758087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03008099997714453, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026950999995278835, "p50": 0.02748000002839035, "p90": 0.02804100000730614, "mean": 0.02758480000011332, "iqr": 0.0006300000450210064, "raw_times": [0.026950999995278835, 0.02804100000730614, 0.027410999962285132, 0.02804100000730614, 0.02748000002839035], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03104999996139668, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026300000001810986, "p50": 0.02733100001250932, "p90": 0.0275399999623005, "mean": 0.02720039998393986, "iqr": 0.0004789999934473599, "raw_times": [0.02706099996885314, 0.02733100001250932, 0.027769999974225357, 0.0275399999623005, 
0.026300000001810986], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03032000000757762, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02642100002958614, "p50": 0.027860999978202017, "p90": 0.02790100000993334, "mean": 0.027615000010428048, "iqr": 0.00036000000136482413, "raw_times": [0.02642100002958614, 0.028351000025850226, 0.027541000008568517, 0.02790100000993334, 0.027860999978202017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03163999997468636, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024230000008174102, "p50": 0.024741000004269154, "p90": 0.025410999967334646, "mean": 0.024872599999525846, "iqr": 0.0011599999538702832, "raw_times": [0.024251000013464363, 0.025730000004386966, 0.024230000008174102, 0.025410999967334646, 
0.024741000004269154], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03134100001034312, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026611000009779673, "p50": 0.029731000040555955, "p90": 0.03027100001418148, "mean": 0.029349000021738902, "iqr": 0.0009999999974752427, "raw_times": [0.026611000009779673, 0.029731000040555955, 0.030861000027471164, 0.03027100001418148, 0.02927100001670624], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034871000025304966, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027259999967554904, "p50": 0.02879100003383428, "p90": 0.030951000042023225, "mean": 0.029224800016436348, "iqr": 0.0029600000175378227, "raw_times": [0.027991000024485402, 0.031131000014283927, 0.02879100003383428, 0.030951000042023225, 
0.027259999967554904], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.0323909999906391, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025391000008312403, "p50": 0.02888100004838634, "p90": 0.029160999986288516, "mean": 0.028055000007043418, "iqr": 0.001839999981712026, "raw_times": [0.025391000008312403, 0.02888100004838634, 0.02952099998765334, 0.029160999986288516, 0.02732100000457649], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031509999985246395, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026630000036220736, "p50": 0.027450000004591857, "p90": 0.027921000025799003, "mean": 0.02735460001304091, "iqr": 0.0010800000040944724, "raw_times": [0.026630000036220736, 0.027450000004591857, 0.02684100002170453, 0.027921000025799003, 0.027930999976888415], 
"has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03172099997073019, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025049999976545223, "p50": 0.02733100001250932, "p90": 0.028329999963716546, "mean": 0.02741439998317219, "iqr": 0.0016189999882953998, "raw_times": [0.025049999976545223, 0.029649999987668707, 0.028329999963716546, 0.02733100001250932, 0.026710999975421146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028341000017917395, "p50": 0.02927099995986282, "p90": 0.029501000028631097, "mean": 0.02909080000108588, "iqr": 0.0009110000291912002, "raw_times": [0.028341000017917395, 0.02927099995986282, 0.029501000028631097, 0.029750999999578198, 0.028589999999439897], "has_warnings": false, 
"reps": 5, "warmup": 2}, "compile_ms": 0.03009099998507736, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024770999971224228, "p50": 0.02814099997294761, "p90": 0.028720999978304462, "mean": 0.0278467999919485, "iqr": 0.0007409999511764909, "raw_times": [0.024770999971224228, 0.02798000002712797, 0.028720999978304462, 0.02814099997294761, 0.029621000010138232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031990999957542954, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027751000004627713, "p50": 0.028230999987499672, "p90": 0.029471000004832604, "mean": 0.028608800005258672, "iqr": 0.0016500000015184924, "raw_times": [0.028230999987499672, 0.027751000004627713, 0.02782100000331411, 0.02977000002601926, 0.029471000004832604], "has_warnings": false, "reps": 5, "warmup": 
2}, "compile_ms": 0.030850999962694914, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html index cb9b44ffb1c09312b21e7b7e432e9d78fbf6e49d..812f027418b96fc5dd3cda564134f577079c3349 100644 --- a/activation/impls/hf_kernels_swiglu.html +++ b/activation/impls/hf_kernels_swiglu.html @@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.26s | Raw -GitHub +GitHub +🤗 HF
@@ -4122,7 +4123,7 @@ Cell: nv | 0.23s
-
Thu Oct 30 15:52:16 2025       
+
Fri Oct 31 20:00:17 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   29C    P0             86W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   33C    P0            108W /  350W |       0MiB /  46068MiB |     88%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,11 +4156,12 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 4.17s
+Cell: benchmark | 4.19s
  | 
 
 Raw
-GitHub
+GitHub
+🤗 HF
 
@@ -4211,17 +4213,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 78.752us 1953.17% 78.752us 78.752us 1 - hf_kernels_swiglu 9.29% 160.875us 99.59% 1.725ms 1.725ms 0.000us 0.00% 5.440us 5.440us 1 - _activation_beeaae6::silu_and_mul 1.15% 19.839us 87.61% 1.518ms 505.995us 4.032us 100.00% 5.440us 1.813us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3 - Activity Buffer Request 83.97% 1.455ms 83.97% 1.455ms 1.455ms 1.408us 34.92% 1.408us 1.408us 1 - aten::empty 2.69% 46.600us 2.69% 46.600us 15.533us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.49% 43.201us 2.49% 43.201us 14.400us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.41% 7.161us 0.41% 7.161us 7.161us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 105.055us 2585.65% 105.055us 105.055us 1 + hf_kernels_swiglu 11.41% 202.714us 99.64% 1.770ms 1.770ms 0.000us 0.00% 5.471us 5.471us 1 + _activation_beeaae6::silu_and_mul 1.18% 21.050us 84.47% 1.501ms 500.190us 4.063us 100.00% 5.471us 1.824us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3 + Activity Buffer Request 80.70% 1.434ms 80.70% 1.434ms 1.434ms 1.408us 34.65% 1.408us 1.408us 1 + aten::empty 3.76% 66.772us 3.76% 66.772us 22.257us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.58% 45.872us 2.58% 45.872us 15.291us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.36% 6.420us 0.36% 6.420us 6.420us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.733ms -Self CUDA time total: 4.032us +Self CPU time total: 1.776ms +Self CUDA time total: 4.063us @@ -4231,17 +4233,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.528us 1575.81% 62.528us 62.528us 1 - hf_kernels_swiglu 6.86% 110.833us 99.69% 1.610ms 1.610ms 0.000us 0.00% 5.312us 5.312us 1 - _activation_beeaae6::silu_and_mul 1.31% 21.159us 91.69% 1.481ms 493.565us 3.968us 100.00% 5.312us 1.771us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3 - Activity Buffer Request 88.77% 1.434ms 88.77% 1.434ms 1.434ms 1.344us 33.87% 1.344us 1.344us 1 - aten::empty 1.14% 18.330us 1.14% 18.330us 6.110us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.61% 26.001us 1.61% 26.001us 8.667us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.31% 5.030us 0.31% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.119us 1540.69% 61.119us 61.119us 1 + hf_kernels_swiglu 6.50% 104.811us 99.67% 1.607ms 1.607ms 0.000us 0.00% 5.279us 5.279us 1 + _activation_beeaae6::silu_and_mul 1.26% 20.331us 91.95% 1.482ms 494.073us 3.967us 100.00% 5.279us 1.760us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3 + Activity Buffer Request 89.13% 1.437ms 89.13% 1.437ms 1.437ms 1.312us 33.07% 1.312us 1.312us 1 + aten::empty 1.22% 19.632us 1.22% 19.632us 6.544us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.56% 25.120us 1.56% 25.120us 8.373us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.33% 5.360us 0.33% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.615ms -Self CUDA time total: 3.968us +Self CPU time total: 1.612ms +Self CUDA time total: 3.967us @@ -4251,17 +4253,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.232us 1291.50% 63.232us 63.232us 1 - hf_kernels_swiglu 6.20% 101.121us 99.70% 1.627ms 1.627ms 0.000us 0.00% 6.528us 6.528us 1 - _activation_beeaae6::silu_and_mul 1.27% 20.780us 92.37% 1.507ms 502.489us 4.896us 100.00% 6.528us 2.176us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3 - Activity Buffer Request 89.54% 1.461ms 89.54% 1.461ms 1.461ms 1.632us 33.33% 1.632us 1.632us 1 - aten::empty 1.13% 18.440us 1.13% 18.440us 6.147us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.56% 25.391us 1.56% 25.391us 8.464us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.30% 4.970us 0.30% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.488us 1288.31% 63.488us 63.488us 1 + hf_kernels_swiglu 6.89% 111.363us 99.67% 1.611ms 1.611ms 0.000us 0.00% 6.592us 6.592us 1 + _activation_beeaae6::silu_and_mul 1.36% 22.028us 91.47% 1.479ms 492.912us 4.928us 100.00% 6.592us 2.197us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3 + Activity Buffer Request 88.52% 1.431ms 88.52% 1.431ms 1.431ms 1.664us 33.77% 1.664us 1.664us 1 + aten::empty 1.30% 21.081us 1.30% 21.081us 7.027us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.59% 25.652us 1.59% 25.652us 8.551us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.33% 5.390us 0.33% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.632ms -Self CUDA time total: 4.896us +Self CPU time total: 1.617ms +Self CUDA time total: 4.928us @@ -4271,17 +4273,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.664us 1554.55% 65.664us 65.664us 1 - hf_kernels_swiglu 5.63% 101.442us 99.74% 1.798ms 1.798ms 0.000us 0.00% 5.632us 5.632us 1 - _activation_beeaae6::silu_and_mul 1.18% 21.341us 92.99% 1.677ms 558.850us 4.224us 100.00% 5.632us 1.877us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 4.224us 100.00% 4.224us 1.408us 3 - Activity Buffer Request 79.26% 1.429ms 79.26% 1.429ms 1.429ms 1.408us 33.33% 1.408us 1.408us 1 - aten::empty 1.12% 20.239us 1.12% 20.239us 6.746us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 12.54% 226.164us 12.54% 226.164us 75.388us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.26% 4.649us 0.26% 4.649us 4.649us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.000us 1585.82% 68.000us 68.000us 1 + hf_kernels_swiglu 5.97% 106.915us 99.70% 1.784ms 1.784ms 0.000us 0.00% 5.760us 5.760us 1 + _activation_beeaae6::silu_and_mul 1.16% 20.770us 92.62% 1.658ms 552.564us 4.288us 100.00% 5.760us 1.920us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3 + Activity Buffer Request 80.58% 1.442ms 80.58% 1.442ms 1.442ms 1.472us 34.33% 1.472us 1.472us 1 + aten::empty 1.10% 19.770us 1.10% 19.770us 6.590us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 10.88% 194.785us 10.88% 194.785us 64.928us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.30% 5.350us 0.30% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.803ms -Self CUDA time total: 4.224us +Self CPU time total: 1.790ms +Self CUDA time total: 4.288us @@ -4291,17 +4293,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.968us 1086.23% 63.968us 63.968us 1 - hf_kernels_swiglu 19.44% 85.062us 98.79% 432.257us 432.257us 0.000us 0.00% 7.874us 7.874us 1 - _activation_beeaae6::silu_and_mul 4.74% 20.731us 74.99% 328.126us 109.375us 5.889us 100.00% 7.874us 2.625us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3 - Activity Buffer Request 29.32% 128.302us 29.32% 128.302us 128.302us 1.985us 33.71% 1.985us 1.985us 1 - aten::empty 4.36% 19.069us 4.36% 19.069us 6.356us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 40.93% 179.093us 40.93% 179.093us 59.698us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.21% 5.289us 1.21% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.599us 1108.28% 65.599us 65.599us 1 + hf_kernels_swiglu 18.75% 89.073us 98.88% 469.813us 469.813us 0.000us 0.00% 7.903us 7.903us 1 + _activation_beeaae6::silu_and_mul 4.69% 22.280us 76.20% 362.069us 120.690us 5.919us 100.00% 7.903us 2.634us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 100.00% 5.919us 1.973us 3 + Activity Buffer Request 38.23% 181.645us 38.23% 181.645us 181.645us 1.984us 33.52% 1.984us 1.984us 1 + aten::empty 3.93% 18.671us 3.93% 18.671us 6.224us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 33.28% 158.144us 33.28% 158.144us 52.715us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.12% 5.330us 1.12% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 437.546us -Self CUDA time total: 5.889us +Self CPU time total: 475.143us +Self CUDA time total: 5.919us @@ -4311,17 +4313,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.167us 867.45% 67.167us 67.167us 1 - hf_kernels_swiglu 5.97% 103.951us 99.66% 1.736ms 1.736ms 0.000us 0.00% 10.335us 10.335us 1 - _activation_beeaae6::silu_and_mul 1.17% 20.451us 92.57% 1.612ms 537.363us 7.743us 100.00% 10.335us 3.445us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 100.00% 7.743us 2.581us 3 - Activity Buffer Request 82.03% 1.429ms 82.03% 1.429ms 1.429ms 2.592us 33.48% 2.592us 2.592us 1 - aten::empty 1.12% 19.510us 1.12% 19.510us 6.503us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 9.36% 162.983us 9.36% 162.983us 54.328us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.34% 5.970us 0.34% 5.970us 5.970us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.207us 906.60% 70.207us 70.207us 1 + hf_kernels_swiglu 6.12% 106.261us 99.74% 1.733ms 1.733ms 0.000us 0.00% 10.336us 10.336us 1 + _activation_beeaae6::silu_and_mul 1.25% 21.782us 92.41% 1.606ms 535.254us 7.744us 100.00% 10.336us 3.445us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 100.00% 7.744us 2.581us 3 + Activity Buffer Request 82.36% 1.431ms 82.36% 1.431ms 1.431ms 2.592us 33.47% 2.592us 2.592us 1 + aten::empty 1.21% 21.081us 1.21% 21.081us 7.027us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.80% 152.893us 8.80% 152.893us 50.964us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.26% 4.511us 0.26% 4.511us 4.511us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.742ms -Self CUDA time total: 7.743us +Self CPU time total: 1.738ms +Self CUDA time total: 7.744us @@ -4331,17 +4333,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.999us 1036.41% 67.999us 67.999us 1 - hf_kernels_swiglu 5.88% 101.172us 99.74% 1.716ms 1.716ms 0.000us 0.00% 8.769us 8.769us 1 - _activation_beeaae6::silu_and_mul 1.20% 20.670us 92.73% 1.596ms 531.873us 6.561us 100.00% 8.769us 2.923us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 100.00% 6.561us 2.187us 3 - Activity Buffer Request 82.56% 1.421ms 82.56% 1.421ms 1.421ms 2.208us 33.65% 2.208us 2.208us 1 - aten::empty 1.13% 19.490us 1.13% 19.490us 6.497us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.96% 154.233us 8.96% 154.233us 51.411us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.26% 4.490us 0.26% 4.490us 4.490us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.214us 1045.06% 69.214us 69.214us 1 + hf_kernels_swiglu 7.00% 122.783us 99.73% 1.750ms 1.750ms 0.000us 0.00% 8.830us 8.830us 1 + _activation_beeaae6::silu_and_mul 1.22% 21.430us 91.58% 1.607ms 535.694us 6.623us 100.00% 8.830us 2.943us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.623us 100.00% 6.623us 2.208us 3 + Activity Buffer Request 81.74% 1.434ms 81.74% 1.434ms 1.434ms 2.207us 33.32% 2.207us 2.207us 1 + aten::empty 1.15% 20.211us 1.15% 20.211us 6.737us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 8.62% 151.304us 8.62% 151.304us 50.435us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 4.780us 0.27% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.721ms -Self CUDA time total: 6.561us +Self CPU time total: 1.755ms +Self CUDA time total: 6.623us @@ -4351,17 +4353,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.295us 670.43% 63.295us 63.295us 1 - hf_kernels_swiglu 23.24% 86.211us 98.67% 366.026us 366.026us 0.000us 0.00% 12.609us 12.609us 1 - _activation_beeaae6::silu_and_mul 5.71% 21.191us 70.40% 261.155us 87.052us 9.441us 100.00% 12.609us 4.203us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 9.441us 100.00% 9.441us 3.147us 3 - Activity Buffer Request 23.85% 88.481us 23.85% 88.481us 88.481us 3.168us 33.56% 3.168us 3.168us 1 - aten::empty 5.03% 18.660us 5.03% 18.660us 6.220us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 40.84% 151.483us 40.84% 151.483us 50.494us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.33% 4.920us 1.33% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.152us 692.52% 65.152us 65.152us 1 + hf_kernels_swiglu 21.62% 91.474us 98.93% 418.571us 418.571us 0.000us 0.00% 12.576us 12.576us 1 + _activation_beeaae6::silu_and_mul 4.88% 20.631us 69.03% 292.067us 97.356us 9.408us 100.00% 12.576us 4.192us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3 + Activity Buffer Request 28.63% 121.143us 28.63% 121.143us 121.143us 3.168us 33.67% 3.168us 3.168us 1 + aten::empty 8.28% 35.030us 8.28% 35.030us 11.677us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 35.52% 150.293us 35.52% 150.293us 50.098us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.07% 4.530us 1.07% 4.530us 4.530us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 370.946us -Self CUDA time total: 9.441us +Self CPU time total: 423.101us +Self CUDA time total: 9.408us @@ -4371,17 +4373,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.342us 500.47% 65.342us 65.342us 1 - hf_kernels_swiglu 22.94% 96.471us 98.88% 415.727us 415.727us 0.000us 0.00% 17.408us 17.408us 1 - _activation_beeaae6::silu_and_mul 5.11% 21.490us 71.29% 299.725us 99.908us 13.056us 100.00% 17.408us 5.803us 3 -void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.056us 100.00% 13.056us 4.352us 3 - Activity Buffer Request 30.59% 128.632us 30.59% 128.632us 128.632us 4.352us 33.33% 4.352us 4.352us 1 - aten::empty 4.65% 19.531us 4.65% 19.531us 6.510us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 35.58% 149.603us 35.58% 149.603us 49.868us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.12% 4.720us 1.12% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.197us 514.72% 67.197us 67.197us 1 + hf_kernels_swiglu 22.39% 97.642us 98.93% 431.481us 431.481us 0.000us 0.00% 17.439us 17.439us 1 + _activation_beeaae6::silu_and_mul 4.99% 21.781us 71.94% 313.789us 104.596us 13.055us 100.00% 17.439us 5.813us 3 +void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 
0.00% 0.000us 0.00% 0.000us 0.000us 13.055us 100.00% 13.055us 4.352us 3 + Activity Buffer Request 32.48% 141.684us 32.48% 141.684us 141.684us 4.384us 33.58% 4.384us 4.384us 1 + aten::empty 4.60% 20.050us 4.60% 20.050us 6.683us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.47% 150.324us 34.47% 150.324us 50.108us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.07% 4.681us 1.07% 4.681us 4.681us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 420.447us -Self CUDA time total: 13.056us +Self CPU time total: 436.162us +Self CUDA time total: 13.055us impl wl p50(ms) ok @@ -4398,12 +4400,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
▶ UV Install Logs
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] -Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.50it/s] -Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 20.28it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 15.31it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 21.41it/s]

Artifacts:

activation.jsonl diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html index 215b2799716ac41798e6372ba0e150a2bd6bd9c0..41f6e46a2626019e3e97d61016b7b71b844385d6 100644 --- a/activation/impls/torch_swiglu.html +++ b/activation/impls/torch_swiglu.html @@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.26s | Raw -GitHub +GitHub
@@ -4122,7 +4122,7 @@ Cell: nv | 0.23s
-
Thu Oct 30 15:52:16 2025       
+
Fri Oct 31 20:00:17 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   29C    P0             86W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   33C    P0            108W /  350W |       0MiB /  46068MiB |     88%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,11 +4155,11 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 6.88s
+Cell: benchmark | 7.02s
  | 
 
 Raw
-GitHub
+GitHub
 
@@ -4205,20 +4205,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 179.327us 1411.47% 179.327us 179.327us 1 - torch_eager 11.22% 210.364us 99.57% 1.867ms 1.867ms 0.000us 0.00% 15.009us 15.009us 1 - aten::silu 3.37% 63.151us 82.30% 1.543ms 514.355us 6.497us 51.14% 8.801us 2.934us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 51.14% 6.497us 2.166us 3 - aten::mul 1.76% 33.030us 2.90% 54.310us 18.103us 6.208us 48.86% 6.208us 2.069us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.86% 6.208us 2.069us 3 - Activity Buffer Request 76.72% 1.439ms 76.72% 1.439ms 1.439ms 2.304us 18.13% 2.304us 2.304us 1 - aten::slice 2.52% 47.241us 3.15% 59.052us 9.842us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 0.63% 11.811us 0.63% 11.811us 1.968us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.34% 62.690us 3.34% 62.690us 10.448us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.43% 8.120us 0.43% 8.120us 8.120us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 188.575us 1476.70% 188.575us 188.575us 1 + torch_eager 11.13% 210.826us 99.56% 1.887ms 1.887ms 0.000us 0.00% 15.106us 15.106us 1 + aten::silu 3.37% 63.781us 82.44% 1.562ms 520.736us 6.497us 50.88% 8.833us 2.944us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 50.88% 6.497us 2.166us 3 + aten::mul 1.86% 35.170us 2.95% 55.841us 18.614us 6.273us 49.12% 6.273us 2.091us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.273us 49.12% 6.273us 2.091us 3 + Activity Buffer Request 76.78% 1.455ms 76.78% 1.455ms 1.455ms 2.336us 18.29% 2.336us 2.336us 1 + aten::slice 2.45% 46.380us 3.05% 57.842us 9.640us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 0.60% 11.462us 0.60% 11.462us 1.910us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.38% 64.112us 3.38% 64.112us 10.685us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.44% 8.280us 0.44% 8.280us 8.280us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.875ms -Self CUDA time total: 12.705us +Self CPU time total: 1.895ms +Self CUDA time total: 12.770us @@ -4228,20 +4228,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.777us 1228.76% 151.777us 151.777us 1 - torch_eager 6.62% 113.831us 99.66% 1.713ms 1.713ms 0.000us 0.00% 14.496us 14.496us 1 - aten::silu 2.46% 42.260us 88.64% 1.523ms 507.722us 6.368us 51.55% 8.512us 2.837us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 51.55% 6.368us 2.123us 3 - aten::mul 1.53% 26.241us 2.60% 44.713us 14.904us 5.984us 48.45% 5.984us 1.995us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.45% 5.984us 1.995us 3 - Activity Buffer Request 84.63% 1.454ms 84.63% 1.454ms 1.454ms 2.144us 17.36% 2.144us 2.144us 1 - aten::slice 1.45% 24.880us 1.80% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 0.35% 6.040us 0.35% 6.040us 1.007us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 2.62% 45.062us 2.62% 45.062us 7.510us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.34% 5.800us 0.34% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.926us 1234.87% 152.926us 152.926us 1 + torch_eager 6.55% 113.093us 99.67% 1.721ms 1.721ms 0.000us 0.00% 14.560us 14.560us 1 + aten::silu 2.40% 41.391us 88.69% 1.532ms 510.609us 6.400us 51.68% 8.576us 2.859us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 51.68% 6.400us 2.133us 3 + aten::mul 1.50% 25.830us 2.63% 45.361us 15.120us 5.984us 48.32% 5.984us 1.995us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3 + Activity Buffer Request 84.72% 1.463ms 84.72% 1.463ms 1.463ms 2.176us 17.57% 2.176us 2.176us 1 + aten::slice 1.43% 24.741us 1.80% 31.062us 5.177us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 0.37% 6.321us 0.37% 6.321us 1.054us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 2.71% 46.721us 2.71% 46.721us 7.787us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.33% 5.741us 0.33% 5.741us 5.741us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.718ms -Self CUDA time total: 12.352us +Self CPU time total: 1.727ms +Self CUDA time total: 12.384us @@ -4251,20 +4251,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.422us 1145.66% 151.422us 151.422us 1 - torch_eager 6.39% 108.591us 99.69% 1.694ms 1.694ms 0.000us 0.00% 15.489us 15.489us 1 - aten::silu 2.42% 41.180us 88.84% 1.509ms 503.045us 6.784us 51.33% 9.056us 3.019us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.33% 6.784us 2.261us 3 - aten::mul 1.56% 26.573us 2.72% 46.263us 15.421us 6.433us 48.67% 6.433us 2.144us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.67% 6.433us 2.144us 3 - Activity Buffer Request 84.90% 1.442ms 84.90% 1.442ms 1.442ms 2.272us 17.19% 2.272us 2.272us 1 - aten::slice 1.42% 24.110us 1.74% 29.570us 4.928us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 0.32% 5.460us 0.32% 5.460us 0.910us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 2.67% 45.420us 2.67% 45.420us 7.570us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.31% 5.240us 0.31% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.413us 1147.86% 152.413us 152.413us 1 + torch_eager 6.17% 105.134us 99.68% 1.699ms 1.699ms 0.000us 0.00% 15.581us 15.581us 1 + aten::silu 2.58% 43.990us 88.96% 1.517ms 505.533us 6.814us 51.32% 9.117us 3.039us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.814us 51.32% 6.814us 2.271us 3 + aten::mul 1.63% 27.711us 2.72% 46.371us 15.457us 6.464us 48.68% 6.464us 2.155us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.68% 6.464us 2.155us 3 + Activity Buffer Request 84.84% 1.446ms 84.84% 1.446ms 1.446ms 2.303us 17.34% 2.303us 2.303us 1 + aten::slice 1.47% 24.990us 1.83% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 0.37% 6.260us 0.37% 6.260us 1.043us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 2.63% 44.871us 2.63% 44.871us 7.478us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.32% 5.431us 0.32% 5.431us 5.431us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.699ms -Self CUDA time total: 13.217us +Self CPU time total: 1.705ms +Self CUDA time total: 13.278us @@ -4274,20 +4274,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.159us 1197.73% 152.159us 152.159us 1 - torch_eager 7.49% 109.251us 99.65% 1.454ms 1.454ms 0.000us 0.00% 14.912us 14.912us 1 - aten::silu 2.87% 41.871us 86.91% 1.268ms 422.724us 6.560us 51.64% 8.768us 2.923us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.64% 6.560us 2.187us 3 - aten::mul 1.82% 26.542us 3.09% 45.132us 15.044us 6.144us 48.36% 6.144us 2.048us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.36% 6.144us 2.048us 3 - Activity Buffer Request 71.19% 1.039ms 71.19% 1.039ms 1.039ms 2.208us 17.38% 2.208us 2.208us 1 - aten::slice 1.75% 25.480us 2.16% 31.560us 5.260us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 0.42% 6.080us 0.42% 6.080us 1.013us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 14.12% 206.043us 14.12% 206.043us 34.340us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.35% 5.050us 0.35% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.359us 1219.84% 155.359us 155.359us 1 + torch_eager 6.31% 109.593us 99.71% 1.733ms 1.733ms 0.000us 0.00% 14.944us 14.944us 1 + aten::silu 2.48% 43.021us 88.93% 1.545ms 515.160us 6.560us 51.51% 8.768us 2.923us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3 + aten::mul 1.62% 28.091us 2.66% 46.261us 15.420us 6.176us 48.49% 6.176us 2.059us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3 + Activity Buffer Request 74.70% 1.298ms 74.70% 1.298ms 1.298ms 2.208us 17.34% 2.208us 2.208us 1 + aten::slice 1.46% 25.370us 1.82% 31.631us 5.272us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 0.36% 6.261us 0.36% 6.261us 1.043us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 12.80% 222.405us 12.80% 222.405us 37.068us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.29% 4.960us 0.29% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.459ms -Self CUDA time total: 12.704us +Self CPU time total: 1.738ms +Self CUDA time total: 12.736us @@ -4297,20 +4297,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.295us 1117.14% 147.295us 147.295us 1 - torch_eager 5.91% 105.630us 99.72% 1.782ms 1.782ms 0.000us 0.00% 15.457us 15.457us 1 - aten::silu 2.35% 41.900us 89.64% 1.602ms 533.846us 6.752us 51.21% 9.024us 3.008us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.21% 6.752us 2.251us 3 - aten::mul 1.43% 25.502us 2.46% 43.882us 14.627us 6.433us 48.79% 6.433us 2.144us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.79% 6.433us 2.144us 3 - Activity Buffer Request 78.53% 1.403ms 78.53% 1.403ms 1.403ms 2.272us 17.23% 2.272us 2.272us 1 - aten::slice 1.39% 24.781us 1.71% 30.582us 5.097us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 0.32% 5.801us 0.32% 5.801us 0.967us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.80% 175.053us 9.80% 175.053us 29.176us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.28% 4.969us 0.28% 4.969us 4.969us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.122us 1152.94% 153.122us 153.122us 1 + torch_eager 5.95% 108.905us 99.72% 1.827ms 1.827ms 0.000us 0.00% 15.585us 15.585us 1 + aten::silu 2.26% 41.441us 89.57% 1.641ms 546.874us 6.816us 51.32% 9.120us 3.040us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.32% 6.816us 2.272us 3 + aten::mul 1.45% 26.581us 2.47% 45.261us 15.087us 6.465us 48.68% 6.465us 2.155us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.68% 6.465us 2.155us 3 + Activity Buffer Request 78.54% 1.439ms 78.54% 1.439ms 1.439ms 2.304us 17.35% 2.304us 2.304us 1 + aten::slice 1.41% 25.869us 1.74% 31.870us 5.312us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 0.33% 6.001us 0.33% 6.001us 1.000us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.78% 179.164us 9.78% 179.164us 29.861us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.28% 5.090us 0.28% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.787ms -Self CUDA time total: 13.185us +Self CPU time total: 1.832ms +Self CUDA time total: 13.281us @@ -4320,20 +4320,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.964us 937.33% 143.964us 143.964us 1 - torch_eager 21.41% 103.402us 98.95% 477.918us 477.918us 0.000us 0.00% 18.047us 18.047us 1 - aten::silu 9.04% 43.640us 62.61% 302.394us 100.798us 7.872us 51.25% 10.560us 3.520us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 51.25% 7.872us 2.624us 3 - aten::mul 5.13% 24.761us 8.85% 42.722us 14.241us 7.487us 48.75% 7.487us 2.496us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.487us 48.75% 7.487us 2.496us 3 - Activity Buffer Request 22.09% 106.692us 22.09% 106.692us 106.692us 2.688us 17.50% 2.688us 2.688us 1 - aten::slice 4.94% 23.880us 6.09% 29.400us 4.900us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 1.14% 5.520us 1.14% 5.520us 0.920us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 35.20% 170.023us 35.20% 170.023us 28.337us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 1.05% 5.060us 1.05% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.877us 970.08% 150.877us 150.877us 1 + torch_eager 20.61% 104.763us 99.03% 503.283us 503.283us 0.000us 0.00% 18.241us 18.241us 1 + aten::silu 8.60% 43.701us 63.19% 321.148us 107.049us 7.969us 51.24% 10.657us 3.552us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.969us 51.24% 7.969us 2.656us 3 + aten::mul 5.45% 27.720us 8.99% 45.690us 15.230us 7.584us 48.76% 7.584us 2.528us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.76% 7.584us 2.528us 3 + Activity Buffer Request 24.24% 123.213us 24.24% 123.213us 123.213us 2.688us 17.28% 2.688us 2.688us 1 + aten::slice 5.04% 25.603us 6.23% 31.682us 5.280us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 1.20% 6.079us 1.20% 6.079us 1.013us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 33.88% 172.204us 33.88% 172.204us 28.701us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.97% 4.940us 0.97% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 482.978us -Self CUDA time total: 15.359us +Self CPU time total: 508.223us +Self CUDA time total: 15.553us @@ -4343,20 +4343,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.301us 1078.65% 154.301us 154.301us 1 - torch_eager 5.96% 107.399us 99.74% 1.796ms 1.796ms 0.000us 0.00% 16.769us 16.769us 1 - aten::silu 2.38% 42.931us 89.51% 1.612ms 537.266us 7.328us 51.23% 9.792us 3.264us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3 - aten::mul 1.49% 26.893us 2.55% 45.883us 15.294us 6.977us 48.77% 6.977us 2.326us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.977us 48.77% 6.977us 2.326us 3 - Activity Buffer Request 78.67% 1.417ms 78.67% 1.417ms 1.417ms 2.464us 17.22% 2.464us 2.464us 1 - aten::slice 1.40% 25.140us 1.72% 31.031us 5.172us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 0.33% 5.891us 0.33% 5.891us 0.982us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.51% 171.283us 9.51% 171.283us 28.547us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 4.600us 0.26% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.541us 1089.44% 156.541us 156.541us 1 + torch_eager 6.81% 125.673us 99.72% 1.840ms 1.840ms 0.000us 0.00% 16.866us 16.866us 1 + aten::silu 2.28% 42.101us 88.57% 1.634ms 544.654us 7.361us 51.23% 9.858us 3.286us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 51.23% 7.361us 2.454us 3 + aten::mul 1.53% 28.200us 2.53% 46.622us 15.541us 7.008us 48.77% 7.008us 2.336us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.77% 7.008us 2.336us 3 + Activity Buffer Request 77.96% 1.438ms 77.96% 1.438ms 1.438ms 2.497us 17.38% 2.497us 2.497us 1 + aten::slice 1.46% 26.979us 1.81% 33.310us 5.552us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 0.34% 6.331us 0.34% 6.331us 1.055us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.33% 172.076us 9.33% 172.076us 28.679us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.28% 5.210us 0.28% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.801ms -Self CUDA time total: 14.305us +Self CPU time total: 1.845ms +Self CUDA time total: 14.369us @@ -4366,20 +4366,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.686us 1002.89% 154.686us 154.686us 1 - torch_eager 22.31% 107.382us 99.03% 476.668us 476.668us 0.000us 0.00% 18.080us 18.080us 1 - aten::silu 9.43% 45.390us 60.13% 289.404us 96.468us 7.872us 51.04% 10.528us 3.509us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 51.04% 7.872us 2.624us 3 - aten::mul 6.54% 31.461us 10.39% 50.022us 16.674us 7.552us 48.96% 7.552us 2.517us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.96% 7.552us 2.517us 3 - Activity Buffer Request 19.41% 93.401us 19.41% 93.401us 93.401us 2.656us 17.22% 2.656us 2.656us 1 - aten::slice 5.01% 24.090us 6.20% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 1.20% 5.770us 1.20% 5.770us 0.962us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 35.15% 169.174us 35.15% 169.174us 28.196us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.97% 4.650us 0.97% 4.650us 4.650us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.754us 962.92% 149.754us 149.754us 1 + torch_eager 21.77% 106.163us 98.85% 481.952us 481.952us 0.000us 0.00% 18.240us 18.240us 1 + aten::silu 8.65% 42.151us 61.90% 301.788us 100.596us 7.968us 51.23% 10.656us 3.552us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.23% 7.968us 2.656us 3 + aten::mul 5.09% 24.801us 8.77% 42.752us 14.251us 7.584us 48.77% 7.584us 2.528us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.77% 7.584us 2.528us 3 + Activity Buffer Request 21.73% 105.953us 21.73% 105.953us 105.953us 2.688us 17.28% 2.688us 2.688us 1 + aten::slice 5.14% 25.050us 6.41% 31.249us 5.208us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 1.27% 6.199us 1.27% 6.199us 1.033us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 35.20% 171.635us 35.20% 171.635us 28.606us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 1.15% 5.600us 1.15% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 481.318us -Self CUDA time total: 15.424us +Self CPU time total: 487.552us +Self CUDA time total: 15.552us @@ -4389,20 +4389,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.678us 692.09% 155.678us 155.678us 1 - torch_eager 6.04% 109.222us 99.73% 1.805ms 1.805ms 0.000us 0.00% 26.365us 26.365us 1 - aten::silu 2.28% 41.351us 89.49% 1.620ms 539.866us 11.614us 51.63% 15.485us 5.162us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.614us 51.63% 11.614us 3.871us 3 - aten::mul 1.47% 26.681us 2.47% 44.641us 14.880us 10.880us 48.37% 10.880us 3.627us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 10.880us 48.37% 10.880us 3.627us 3 - Activity Buffer Request 78.73% 1.425ms 78.73% 1.425ms 1.425ms 3.871us 17.21% 3.871us 3.871us 1 - aten::slice 1.39% 25.188us 1.73% 31.390us 5.232us 0.000us 0.00% 0.000us 0.000us 6 - aten::as_strided 0.34% 6.202us 0.34% 6.202us 1.034us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.47% 171.352us 9.47% 171.352us 28.559us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.27% 4.900us 0.27% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 187.357us 834.00% 187.357us 187.357us 1 + torch_eager 6.93% 128.860us 99.74% 1.856ms 1.856ms 0.000us 0.00% 26.369us 26.369us 1 + aten::silu 2.32% 43.123us 88.23% 1.642ms 547.175us 11.616us 51.71% 15.520us 5.173us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 51.71% 11.616us 3.872us 3 + aten::mul 1.63% 30.312us 2.74% 50.922us 16.974us 10.849us 48.29% 10.849us 3.616us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.849us 48.29% 10.849us 3.616us 3 + Activity Buffer Request 77.79% 1.447ms 77.79% 1.447ms 1.447ms 3.904us 17.38% 3.904us 3.904us 1 + aten::slice 1.49% 27.691us 1.84% 34.251us 5.708us 0.000us 0.00% 0.000us 0.000us 6 + aten::as_strided 0.35% 6.560us 0.35% 6.560us 1.093us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 9.23% 171.734us 9.23% 171.734us 28.622us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 4.930us 0.26% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.810ms -Self CUDA time total: 22.494us +Self CPU time total: 1.860ms +Self CUDA time total: 22.465us impl wl p50(ms) ok @@ -4419,7 +4419,7 @@ torch_eager cuda_T512_D768 0.05 True
▶ UV Install Logs
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg index 961d35bc69df12d3f8c1e9441cc14de8f19fb723..c90094a9212ed4b3ea466620aa29c029e98de04f 100644 --- a/activation/results/artifacts/combine/latency.svg +++ b/activation/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49127439c8b28e18efed1525d57e9bb48bdb632034f2f84a60940f7d447aff24 -size 20647 +oid sha256:085b4a64bddea2955d6d074836121ec2e120fb1ca9140f3ccb75e8358e4526b3 +size 20644 diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html index 2ed3e05955eb7f6d843de731dbef9c8c20788b83..aefcf7c048ef413bda722db3be44aa8b9b9cef43 100644 --- a/activation/results/combined_results.html +++ b/activation/results/combined_results.html @@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content { - 2025-10-30T15:53:40.869549 + 2025-10-31T20:14:01.265668 image/svg+xml @@ -4256,83 +4256,83 @@ body[data-tool="eraser"] .main-content { - + - + - 0.025 + 0.025 - + - + - 0.030 + 0.030 - + - + - 0.035 + 0.035 - + - + - 0.040 + 0.040 - + - + - 0.045 + 0.045 - + - + - 0.050 + 0.050 @@ -4340,37 +4340,37 @@ body[data-tool="eraser"] .main-content { - + - - - - - - - - + + + + + + + + - + - + - - - - - - - + + + + + + + @@ -4428,7 +4428,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.28s +Cell: combine | 4.32s | Raw @@ -4554,7 +4554,7 @@ Implementations included:
▶ UV Install Logs
@@ -4567,7 +4567,7 @@ Installed 37 packages in 222ms - 2025-10-30T15:53:40.869549 + 2025-10-31T20:14:01.265668 image/svg+xml @@ -4716,83 +4716,83 @@ Installed 37 packages in 222ms - + - + - 0.025 + 0.025 - + - + - 0.030 + 0.030 - + - + - 0.035 + 0.035 - + - + - 0.040 + 0.040 - + - + - 0.045 + 0.045 - + - + - 0.050 + 0.050 @@ -4800,37 +4800,37 @@ Installed 37 packages in 222ms - + - - - - - - - - + + + + + + + + - + - + - - - - - - - + + + + + + + diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl index 4dbcd737042ccd89af4999232ce91680c8569342..7bfddcfb2c66ba429fccc98758725309b85f6780 100644 --- a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl @@ -1,24 +1,24 @@ -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04394999996293336, "p50": 0.04566100000147344, "p90": 0.046750000024076144, "mean": 0.04579239999884521, "iqr": 0.0020500000346146408, "raw_times": [0.0446999999894615, 0.047901000016281614, 0.046750000024076144, 0.04566100000147344, 0.04394999996293336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05609099997627709, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", 
"backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05193099997313766, "p50": 0.05449100001442275, "p90": 0.054510999973444996, "mean": 0.05559319998837964, "iqr": 0.0010200000133409048, "raw_times": [0.05349099996010409, 0.05449100001442275, 0.06354200002078869, 0.05193099997313766, 0.054510999973444996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.060221000012461445, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051560999963840004, "p50": 0.05184100001542902, "p90": 0.05310099999178419, "mean": 0.05230499999697713, "iqr": 0.0014099999816608033, "raw_times": [0.05184100001542902, 0.05333100000370905, 0.05310099999178419, 0.05169100001012339, 0.051560999963840004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058330999991085264, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", 
"tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05121099997040801, "p50": 0.051831000007496186, "p90": 0.052310999990368146, "mean": 0.05185479999454401, "iqr": 0.0008799999591246888, "raw_times": [0.05121099997040801, 0.051831000007496186, 0.052310999990368146, 0.05248999997320425, 0.05143100003124346], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05627100000538121, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050751000003401714, "p50": 0.051640999970459234, "p90": 0.05217000000357075, "mean": 0.05161080000561924, "iqr": 0.0008689999617672584, "raw_times": [0.05219100000886101, 0.05217000000357075, 0.050751000003401714, 0.05130100004180349, 0.051640999970459234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055421000013211597, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": 
"a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04883100001507046, "p50": 0.049950999994052836, "p90": 0.05039000001261229, "mean": 0.04992260001017712, "iqr": 0.0006600000119760807, "raw_times": [0.04883100001507046, 0.05071100002851381, 0.04973000000063621, 0.05039000001261229, 0.049950999994052836], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05684100000280523, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04895099999657759, "p50": 0.050181000005977694, "p90": 0.05176100000880979, "mean": 0.05066500000339147, "iqr": 0.0021600000081889448, "raw_times": [0.04960100000062084, 0.05176100000880979, 0.050181000005977694, 0.05283100000497143, 0.04895099999657759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05629100002124687, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} 
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048800999991271965, "p50": 0.051240999994206504, "p90": 0.0513809999915793, "mean": 0.05085500000632237, "iqr": 0.00043999995114063495, "raw_times": [0.051240999994206504, 0.048800999991271965, 0.051911000014115416, 0.050941000040438666, 0.0513809999915793], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056131000008008414, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04940100001249448, "p50": 0.05085099996904319, "p90": 0.05221100002472667, "mean": 0.05112659999895186, "iqr": 0.0015410000742122065, "raw_times": [0.050669999950514466, 0.05221100002472667, 0.04940100001249448, 0.0525000000379805, 0.05085099996904319], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053861000026245165, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, 
"ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04947999997284569, "p50": 0.05073100004437947, "p90": 0.05098100001532657, "mean": 0.05063280001422754, "iqr": 0.0010900000120273035, "raw_times": [0.04947999997284569, 0.05098100001532657, 0.04989100000329927, 0.05073100004437947, 0.052081000035286706], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054841000007854745, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05039100000203689, "p50": 0.051160999987587275, "p90": 0.05154000001539316, "mean": 0.051364599994485616, "iqr": 0.00038000001723048626, "raw_times": [0.051160999987587275, 0.05257099996924808, 0.05039100000203689, 0.05154000001539316, 0.051159999998162675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05513099995368975, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 
0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048071000037452905, "p50": 0.05178100002467545, "p90": 0.0526809999996658, "mean": 0.05150900001353875, "iqr": 0.0032599999713056604, "raw_times": [0.04942100002836014, 0.0526809999996658, 0.05178100002467545, 0.05559099997753947, 0.048071000037452905], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05527100000790597, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04963099996757592, "p50": 0.05203099999562255, "p90": 0.052549999963957816, "mean": 0.05276679999042244, "iqr": 0.0005189999683352653, "raw_times": [0.05759100002933337, 0.05203099999562255, 0.052549999963957816, 0.04963099996757592, 0.05203099999562255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07661199998665325, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, 
"corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049701000023105735, "p50": 0.051581000036549085, "p90": 0.05290100000365783, "mean": 0.05255880001868718, "iqr": 0.002381000001605571, "raw_times": [0.05290100000365783, 0.058091000028070994, 0.051581000036549085, 0.05052000000205226, 0.049701000023105735], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054920000025049376, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0517009999612128, "p50": 0.05219999997052582, "p90": 0.05233100000623381, "mean": 0.05215079999061345, "iqr": 0.0001500000053056283, "raw_times": [0.05233100000623381, 0.05234100001416664, 0.05219999997052582, 0.0517009999612128, 0.05218100000092818], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055141000018466, "peak_bytes": 2360320, 
"ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05047100000865612, "p50": 0.05349100001694751, "p90": 0.05691100000149163, "mean": 0.057148999997025385, "iqr": 0.004350000040176383, "raw_times": [0.05047100000865612, 0.05349100001694751, 0.07231099999671642, 0.05256099996131525, 0.05691100000149163], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05554099999471873, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049690000025748304, "p50": 0.050921000024573004, "p90": 0.051730999985011294, "mean": 0.051232800001344, "iqr": 0.0010800000040944724, "raw_times": [0.05065099998091682, 0.051730999985011294, 0.05317099999047059, 0.049690000025748304, 0.050921000024573004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 
0.05373099997996178, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05013100002315696, "p50": 0.05073099998753605, "p90": 0.052470999946763186, "mean": 0.051448999988679134, "iqr": 0.001829999973779195, "raw_times": [0.05013100002315696, 0.05073099998753605, 0.05327100001295548, 0.052470999946763186, 0.05064099997298399], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05419999996547631, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04918100000850245, "p50": 0.050670999996782484, "p90": 0.05192099996520483, "mean": 0.050938799995492445, "iqr": 0.0013709999393540784, "raw_times": [0.04918100000850245, 0.05192099996520483, 0.05237099998112171, 0.05055000002585075, 0.050670999996782484], "has_warnings": false, 
"reps": 5, "warmup": 2}, "compile_ms": 0.05559099997753947, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049871000044277025, "p50": 0.05047100000865612, "p90": 0.05118100000345294, "mean": 0.050820800015571876, "iqr": 0.0007699999855503847, "raw_times": [0.049871000044277025, 0.05041100001790255, 0.05217000000357075, 0.05047100000865612, 0.05118100000345294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05564100001720362, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05115100003649786, "p50": 0.052071000027353875, "p90": 0.05212100001017461, "mean": 0.05199700001412566, "iqr": 0.0006100000291553442, "raw_times": [0.05115100003649786, 0.052071000027353875, 0.053131000015582686, 0.05212100001017461, 
0.05151099998101927], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05440099999987069, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04927099996621109, "p50": 0.051500999973086437, "p90": 0.05194099998107049, "mean": 0.05114499998626343, "iqr": 0.000919999990856013, "raw_times": [0.051500999973086437, 0.04927099996621109, 0.051991000020734646, 0.05194099998107049, 0.05102099999021448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054591000036907644, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049930999978187174, "p50": 0.050361000035081815, "p90": 0.05102099999021448, "mean": 0.05066480000550655, "iqr": 0.0008009999987734773, "raw_times": [0.050219999991441, 0.050361000035081815, 
0.05179100003260828, 0.049930999978187174, 0.05102099999021448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05545099998016667, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0500799999940682, "p50": 0.05195099998900332, "p90": 0.051991000020734646, "mean": 0.05318280000210507, "iqr": 0.0014600000213249587, "raw_times": [0.0500799999940682, 0.05195099998900332, 0.051991000020734646, 0.05053099999940969, 0.061361000007309485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05489099999067548, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06906199996592477, "p50": 0.07093199997143529, "p90": 0.07169200000589626, "mean": 0.07107379998387842, "iqr": 0.0011000000199601345, "raw_times": 
[0.07093199997143529, 0.07309099999019963, 0.07059199998593613, 0.07169200000589626, 0.06906199996592477], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07642200000645971, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.08879199998546028, "p90": 0.08886199998414668, "mean": 0.0890762000040013, "iqr": 0.00037899997096246807, "raw_times": [0.08730199999718025, 0.08879199998546028, 0.08848300001318421, 0.08886199998414668, 0.09194200004003505], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.091862999965997, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08465199999818651, "p50": 0.08821300002637145, "p90": 0.08871199997884105, "mean": 0.08770840000806857, "iqr": 0.0007599999776175537, "raw_times": 
[0.08465199999818651, 0.0879520000012235, 0.08821300002637145, 0.08901300003572032, 0.08871199997884105], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09156300001222917, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08501199999955134, "p50": 0.08710200000905388, "p90": 0.08719199996676252, "mean": 0.08665020000080403, "iqr": 0.001349999934063817, "raw_times": [0.08501199999955134, 0.08710200000905388, 0.08719199996676252, 0.0858420000326987, 0.08810299999595372], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09103200000026845, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575200001814665, "p50": 0.08690200002092752, "p90": 0.08706200003416598, "mean": 0.08684220001669019, "iqr": 0.00029900002118665725, "raw_times": 
[0.08773199999723147, 0.08676300001297932, 0.08690200002092752, 0.08706200003416598, 0.08575200001814665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09036199998035954, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08490200002597703, "p50": 0.08731200000511308, "p90": 0.0877829999694768, "mean": 0.08806820000017979, "iqr": 0.001451000002816727, "raw_times": [0.09401200003367194, 0.08731200000511308, 0.08633199996666008, 0.08490200002597703, 0.0877829999694768], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0907329999790818, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0847820000444699, "p50": 0.08513199998105847, "p90": 0.08660200001031626, "mean": 0.08566600000676772, "iqr": 0.0016600000094513234, "raw_times": 
[0.08494200000086494, 0.0847820000444699, 0.08687199999712902, 0.08660200001031626, 0.08513199998105847], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911219999579771, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08356199998615921, "p50": 0.0846430000365217, "p90": 0.08576199996923606, "mean": 0.08508039999242101, "iqr": 0.0011189999895577785, "raw_times": [0.08356199998615921, 0.0867919999905098, 0.08464299997967828, 0.08576199996923606, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08955300000934585, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08469199997307442, "p50": 0.08614199998646654, "p90": 0.08723299998791845, "mean": 0.08654439999418173, "iqr": 0.0011309999763398082, "raw_times": 
[0.08469199997307442, 0.08610200001157864, 0.08614199998646654, 0.08855300001187061, 0.08723299998791845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09115300002804361, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08576300001550408, "p50": 0.08703200001036748, "p90": 0.08823299998539369, "mean": 0.09075460000076419, "iqr": 0.0015310000094359566, "raw_times": [0.10604300001659794, 0.08823299998539369, 0.08703200001036748, 0.08670199997595773, 0.08576300001550408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985199997368909, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14525299997103502, "p50": 0.1457439999512644, "p90": 0.1459139999724357, "mean": 0.1457395999750588, "iqr": 0.00044099999740865314, 
"raw_times": [0.14525299997103502, 0.14547299997502705, 0.1457439999512644, 0.14631400000553185, 0.1459139999724357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1472430000148961, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16037399996093882, "p50": 0.16231400002197915, "p90": 0.16309400001546237, "mean": 0.1622881999992387, "iqr": 0.0012190000120426703, "raw_times": [0.16309400001546237, 0.16231400002197915, 0.16378399999439353, 0.1618750000034197, 0.16037399996093882], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16341399998509587, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08445299999948475, "p50": 0.08518200002072263, "p90": 0.08666200000106983, "mean": 0.08572240001285536, "iqr": 
0.0017899999988912896, "raw_times": [0.08445299999948475, 0.08744300004082106, 0.08518200002072263, 0.08666200000106983, 0.08487200000217854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0890119999894523, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08437200000344092, "p50": 0.08463200003916427, "p90": 0.08609200000364581, "mean": 0.08522400000856578, "iqr": 0.0015900000107649248, "raw_times": [0.08463200003916427, 0.08609200000364581, 0.08652200000369703, 0.08437200000344092, 0.08450199999288088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08977199996706986, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08375199996635274, "p50": 0.08519199997181204, "p90": 0.08627200003274993, "mean": 0.08607399998936671, "iqr": 
0.0020100000597267353, "raw_times": [0.08375199996635274, 0.0842619999730232, 0.08627200003274993, 0.08519199997181204, 0.09089200000289566], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08821199998010343, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08470200003785067, "p50": 0.08566200000359458, "p90": 0.08573299999170558, "mean": 0.08566220001284819, "iqr": 0.0006109999617365247, "raw_times": [0.08470200003785067, 0.08709200000112105, 0.08512200002996906, 0.08566200000359458, 0.08573299999170558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08864200003699807, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08451200000081371, "p50": 0.08525300000883362, "p90": 0.08580199994412396, "mean": 0.08525219999455658, "iqr": 
0.0009299999419454252, "raw_times": [0.08580199994412396, 0.08525300000883362, 0.08451200000081371, 0.08487200000217854, 0.08582200001683304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08942300001990588, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08533199996918484, "p50": 0.08693199998788259, "p90": 0.09015199998430035, "mean": 0.08883799998784525, "iqr": 0.0043200000163778896, "raw_times": [0.08533199996918484, 0.09015199998430035, 0.08583199996792246, 0.08693199998788259, 0.09594200002993603], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09176200001093093, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08384200003774822, "p50": 0.08611200001951147, "p90": 0.08663199997727133, "mean": 0.08570400000280642, 
"iqr": 0.001730000008137722, "raw_times": [0.08384200003774822, 0.08611200001951147, 0.08703200001036748, 0.08663199997727133, 0.08490199996913361], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941200002254845, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08507300003657292, "p50": 0.0865819999944506, "p90": 0.08741199997075455, "mean": 0.09195439998848087, "iqr": 0.0020300000187489786, "raw_times": [0.11532299998862072, 0.0865819999944506, 0.08741199997075455, 0.08538199995200557, 0.08507300003657292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08733200002097874, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09419299999535724, "p50": 0.09539199999153425, "p90": 0.09730299996135727, "mean": 
0.09678459998667677, "iqr": 0.002380999944762152, "raw_times": [0.10211299996853995, 0.09730299996135727, 0.09492200001659512, 0.09539199999153425, 0.09419299999535724], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09651299995994123, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.10080199996309602, "p50": 0.10192199999892182, "p90": 0.1026219999857858, "mean": 0.10294419998899684, "iqr": 0.0008999999749903509, "raw_times": [0.10765299998638511, 0.10172200001079545, 0.1026219999857858, 0.10192199999892182, 0.10080199996309602], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10299199999508346, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4861929999719905, "p50": 0.4890019999947981, "p90": 0.48961200002395344, 
"mean": 0.48862639999924795, "iqr": 0.001079000014669873, "raw_times": [0.48979199999621414, 0.4861929999719905, 0.48961200002395344, 0.4890019999947981, 0.48853300000928357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48705300002893637, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49736299996538946, "p50": 0.49848299994437184, "p90": 0.49918199999865465, "mean": 0.4987367999774506, "iqr": 0.0007590000450363732, "raw_times": [0.4984229999536183, 0.49848299994437184, 0.49918199999865465, 0.5002330000252186, 0.49736299996538946], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4985730000157673, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py index 725b12c4018e4eec05c5ddccb0c88a8eae6f150d..2e38669a505cbdf181a93e97f31ed1e67ecf4883 100644 --- a/causal_conv1d/impls/cells/benchmark.py +++ b/causal_conv1d/impls/cells/benchmark.py @@ -4,28 +4,37 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", -# "kernels", # ] # # [tool.uv.sources] # kernels-benchmark-tools = { path = "../../../../../tools", editable = true } # /// import torch +import torch.nn.functional as F import sys from 
kernels_benchmark_tools import KernelTypeEnum, run_benchmark -from kernels import get_kernel -# Load the causal conv1d kernel -causal_conv1d = get_kernel("kernels-community/causal-conv1d") +def torch_causal_conv1d(input_tensor, weight, bias): + # Convert to weight dtype for computation + x = input_tensor.to(weight.dtype) + dim = weight.shape[0] + width = weight.shape[1] + seqlen = input_tensor.shape[-1] -def hf_kernels_causal_conv1d(input_tensor, weight, bias): - return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias) + # Depthwise causal conv1d using PyTorch + out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim) + + # Truncate to original sequence length + out = out[..., :seqlen] + + # Convert back to original dtype + return out.to(input_tensor.dtype) run_benchmark( kernel_type=KernelTypeEnum.CAUSAL_CONV1D, - impl_name="hf_kernels_causal_conv1d", - impl_tags={"family": "hf-kernels", "backend": "cuda"}, - impl_func=hf_kernels_causal_conv1d, + impl_name="torch_eager", + impl_tags={"family": "pytorch", "backend": "eager"}, + impl_func=torch_causal_conv1d, ) \ No newline at end of file diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html index e161062d07cab205d4d881403fd3310ed83e20ca..cb1bde40be01c47bdde38e8da86912f92e3be9c0 100644 --- a/causal_conv1d/impls/hf_kernels_causal_conv1d.html +++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html @@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.28s +Cell: nv | 0.21s | Raw -GitHub +GitHub +🤗 HF
@@ -4122,7 +4123,7 @@ Cell: nv | 0.28s
-
Thu Oct 30 15:51:43 2025       
+
Fri Oct 31 20:00:25 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   33C    P0             79W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,11 +4156,12 @@ Cell: nv | 0.28s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 5.66s
+Cell: benchmark | 9.11s
  | 
 
 Raw
-GitHub
+GitHub
+🤗 HF
 
@@ -4208,19 +4210,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 148.031us 3643.39% 148.031us 148.031us 1 - hf_kernels_causal_conv1d 8.90% 165.322us 99.57% 1.851ms 1.851ms 0.000us 0.00% 5.503us 5.503us 1 - CausalConv1dFn 5.85% 108.724us 90.68% 1.685ms 561.740us 0.000us 0.00% 5.503us 1.834us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.35% 25.159us 81.18% 1.509ms 502.865us 4.063us 100.00% 5.503us 1.834us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3 - Activity Buffer Request 77.32% 1.437ms 77.32% 1.437ms 1.437ms 1.440us 35.44% 1.440us 1.440us 1 - aten::empty_like 0.95% 17.630us 3.65% 67.900us 22.633us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 2.70% 50.270us 2.70% 50.270us 16.757us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 2.50% 46.532us 2.50% 46.532us 15.511us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.43% 7.900us 0.43% 7.900us 7.900us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 180.703us 4446.43% 180.703us 180.703us 1 + hf_kernels_causal_conv1d 8.48% 160.534us 99.62% 1.886ms 1.886ms 0.000us 0.00% 5.504us 5.504us 1 + CausalConv1dFn 6.47% 122.423us 91.15% 1.726ms 575.261us 0.000us 0.00% 5.504us 1.835us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.51% 28.612us 80.84% 1.531ms 510.207us 4.064us 100.00% 5.504us 1.835us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3 + Activity Buffer Request 76.71% 1.452ms 76.71% 1.452ms 1.452ms 1.440us 35.43% 1.440us 1.440us 1 + aten::empty_like 1.07% 20.220us 3.84% 72.741us 24.247us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 2.77% 52.521us 2.77% 52.521us 17.507us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.62% 49.571us 2.62% 49.571us 16.524us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.38% 7.101us 0.38% 7.101us 7.101us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.858ms -Self CUDA time total: 4.063us +Self CPU time total: 1.893ms +Self CUDA time total: 4.064us @@ -4230,19 +4232,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4 ------------------------------------------------------- ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.926us 3229.86% 120.926us 120.926us 1 - hf_kernels_causal_conv1d 5.72% 96.561us 99.68% 1.683ms 1.683ms 0.000us 0.00% 4.992us 4.992us 1 - CausalConv1dFn 4.27% 72.072us 93.97% 1.587ms 528.936us 0.000us 0.00% 4.992us 1.664us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.50% 25.350us 87.84% 1.483ms 494.459us 3.744us 100.00% 4.992us 1.664us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.744us 100.00% 3.744us 1.248us 3 - Activity Buffer Request 84.49% 1.427ms 84.49% 1.427ms 1.427ms 1.248us 33.33% 1.248us 1.248us 1 - aten::empty_like 0.48% 8.160us 1.86% 31.360us 10.453us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.37% 23.200us 1.37% 23.200us 7.733us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.85% 31.292us 1.85% 31.292us 10.431us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.32% 5.320us 0.32% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.791us 3331.33% 125.791us 125.791us 1 + hf_kernels_causal_conv1d 5.58% 96.392us 99.64% 1.721ms 1.721ms 0.000us 0.00% 5.056us 5.056us 1 + CausalConv1dFn 4.40% 76.074us 94.06% 1.625ms 541.671us 0.000us 0.00% 5.056us 1.685us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.52% 26.231us 87.95% 1.519ms 506.473us 3.776us 100.00% 5.056us 1.685us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.776us 100.00% 3.776us 1.259us 3 + Activity Buffer Request 84.56% 1.461ms 84.56% 1.461ms 1.461ms 1.280us 33.90% 1.280us 1.280us 1 + aten::empty_like 0.44% 7.590us 1.71% 29.520us 9.840us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.27% 21.930us 1.27% 21.930us 7.310us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.87% 32.290us 1.87% 32.290us 10.763us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.36% 6.200us 0.36% 6.200us 6.200us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.689ms -Self CUDA time total: 3.744us +Self CPU time total: 1.728ms +Self CUDA time total: 3.776us @@ -4252,18 +4254,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.942us 3255.88% 122.942us 122.942us 1 - hf_kernels_causal_conv1d 6.02% 102.400us 99.66% 1.696ms 1.696ms 0.000us 0.00% 5.023us 5.023us 1 - CausalConv1dFn 4.37% 74.304us 93.64% 1.594ms 531.323us 0.000us 0.00% 5.023us 1.674us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.51% 25.778us 87.51% 1.490ms 496.532us 3.776us 100.00% 5.023us 1.674us 3 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.758us 3330.46% 125.758us 125.758us 1 + hf_kernels_causal_conv1d 5.23% 90.742us 99.66% 1.729ms 1.729ms 0.000us 
0.00% 5.056us 5.056us 1 + CausalConv1dFn 4.39% 76.092us 94.43% 1.638ms 546.081us 0.000us 0.00% 5.056us 1.685us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.50% 26.031us 88.31% 1.532ms 510.660us 3.776us 100.00% 5.056us 1.685us 3 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.776us 100.00% 3.776us 1.259us 3 - Activity Buffer Request 84.19% 1.433ms 84.19% 1.433ms 1.433ms 1.247us 33.02% 1.247us 1.247us 1 - aten::empty_like 0.48% 8.219us 1.77% 30.070us 10.023us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.28% 21.851us 1.28% 21.851us 7.284us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 1.81% 30.742us 1.81% 30.742us 10.247us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.34% 5.821us 0.34% 5.821us 5.821us 0.000us 0.00% 0.000us 0.000us 1 + Activity Buffer Request 84.98% 1.474ms 84.98% 1.474ms 1.474ms 1.280us 33.90% 1.280us 1.280us 1 + aten::empty_like 0.47% 8.201us 1.74% 30.171us 10.057us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.27% 21.970us 1.27% 21.970us 7.323us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.83% 31.671us 1.83% 31.671us 10.557us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.34% 5.850us 0.34% 5.850us 5.850us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.702ms +Self CPU time total: 1.735ms Self CUDA time total: 3.776us @@ -4274,19 +4276,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 154.975us 4105.30% 154.975us 154.975us 1 - hf_kernels_causal_conv1d 5.10% 97.113us 99.71% 1.897ms 1.897ms 0.000us 0.00% 5.022us 5.022us 1 - CausalConv1dFn 5.06% 96.320us 94.60% 1.800ms 599.880us 0.000us 0.00% 5.022us 1.674us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.32% 25.153us 87.78% 1.670ms 556.640us 3.775us 100.00% 5.022us 1.674us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.775us 100.00% 3.775us 1.258us 3 - Activity Buffer Request 75.43% 1.435ms 75.43% 1.435ms 1.435ms 1.247us 33.03% 1.247us 1.247us 1 - aten::empty_like 0.48% 9.119us 1.76% 33.400us 11.133us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.28% 24.281us 1.28% 24.281us 8.094us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 11.03% 209.783us 11.03% 209.783us 69.928us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.29% 5.600us 0.29% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 127.584us 3350.42% 127.584us 127.584us 1 + hf_kernels_causal_conv1d 4.53% 88.983us 99.75% 1.962ms 1.962ms 0.000us 0.00% 5.088us 5.088us 1 + CausalConv1dFn 3.93% 77.252us 95.23% 1.873ms 624.219us 0.000us 0.00% 5.088us 1.696us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.36% 26.710us 89.83% 1.766ms 588.805us 3.808us 100.00% 5.088us 1.696us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.808us 100.00% 3.808us 1.269us 3 + Activity Buffer Request 74.34% 1.462ms 74.34% 1.462ms 1.462ms 1.280us 33.61% 1.280us 1.280us 1 + aten::empty_like 0.41% 8.060us 1.47% 28.990us 9.663us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.06% 20.930us 1.06% 20.930us 6.977us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 14.13% 277.777us 14.13% 277.777us 92.592us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.25% 4.831us 0.25% 4.831us 4.831us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.902ms -Self CUDA time total: 3.775us +Self CPU time total: 1.966ms +Self CUDA time total: 3.808us @@ -4296,19 +4298,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 127.520us 2656.67% 127.520us 127.520us 1 - hf_kernels_causal_conv1d 5.48% 101.023us 99.67% 1.838ms 1.838ms 0.000us 0.00% 6.400us 6.400us 1 - CausalConv1dFn 4.02% 74.081us 94.20% 1.737ms 579.070us 0.000us 0.00% 6.400us 2.133us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 25.982us 88.51% 1.632ms 544.113us 4.800us 100.00% 6.400us 2.133us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 4.800us 100.00% 4.800us 1.600us 3 - Activity Buffer Request 78.02% 1.439ms 78.02% 1.439ms 1.439ms 1.600us 33.33% 1.600us 1.600us 1 - aten::empty_like 0.45% 8.310us 1.67% 30.790us 10.263us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.22% 22.480us 1.22% 22.480us 7.493us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 9.08% 167.462us 9.08% 167.462us 55.821us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.33% 6.020us 0.33% 6.020us 6.020us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 126.686us 2639.84% 126.686us 126.686us 1 + hf_kernels_causal_conv1d 4.55% 87.622us 99.73% 1.920ms 1.920ms 0.000us 0.00% 6.430us 6.430us 1 + CausalConv1dFn 3.92% 75.482us 95.18% 1.832ms 610.789us 0.000us 0.00% 6.430us 2.143us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.44% 27.663us 89.66% 1.726ms 575.372us 4.799us 100.00% 6.430us 2.143us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.799us 100.00% 4.799us 1.600us 3 + Activity Buffer Request 74.49% 1.434ms 74.49% 1.434ms 1.434ms 1.631us 33.99% 1.631us 1.631us 1 + aten::empty_like 0.42% 8.140us 1.60% 30.770us 10.257us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.18% 22.630us 1.18% 22.630us 7.543us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 13.74% 264.526us 13.74% 264.526us 88.175us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 5.120us 0.27% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.844ms -Self CUDA time total: 4.800us +Self CPU time total: 1.925ms +Self CUDA time total: 4.799us @@ -4318,19 +4320,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4 ------------------------------------------------------- ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.208us 2446.36% 118.208us 118.208us 1 - hf_kernels_causal_conv1d 14.10% 77.840us 98.97% 546.449us 546.449us 0.000us 0.00% 6.464us 6.464us 1 - CausalConv1dFn 13.03% 71.942us 84.87% 468.609us 156.203us 0.000us 0.00% 6.464us 2.155us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 4.50% 24.830us 66.59% 367.636us 122.545us 4.832us 100.00% 6.464us 2.155us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.832us 100.00% 4.832us 1.611us 3 - Activity Buffer Request 33.64% 185.743us 33.64% 185.743us 185.743us 1.632us 33.77% 1.632us 1.632us 1 - aten::empty_like 1.44% 7.931us 5.26% 29.031us 9.677us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 3.82% 21.100us 3.82% 21.100us 7.033us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 28.45% 157.063us 28.45% 157.063us 52.354us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.03% 5.680us 1.03% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.083us 2423.58% 117.083us 117.083us 1 + hf_kernels_causal_conv1d 12.24% 83.203us 99.28% 674.957us 674.957us 0.000us 0.00% 6.463us 6.463us 1 + CausalConv1dFn 10.43% 70.911us 87.04% 591.754us 197.251us 0.000us 0.00% 6.463us 2.154us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 3.93% 26.710us 72.18% 490.682us 163.561us 4.831us 100.00% 6.463us 2.154us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 4.831us 100.00% 4.831us 1.610us 3 + Activity Buffer Request 32.42% 220.416us 32.42% 220.416us 220.416us 1.632us 33.78% 1.632us 1.632us 1 + aten::empty_like 1.07% 7.270us 4.44% 30.161us 10.054us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.37% 22.891us 3.37% 22.891us 7.630us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 35.83% 243.556us 35.83% 243.556us 81.185us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.72% 4.870us 0.72% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 552.129us -Self CUDA time total: 4.832us +Self CPU time total: 679.827us +Self CUDA time total: 4.831us @@ -4340,19 +4342,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.887us 1226.27% 129.887us 129.887us 1 - hf_kernels_causal_conv1d 5.23% 95.772us 99.69% 1.826ms 1.826ms 0.000us 0.00% 14.144us 14.144us 1 - CausalConv1dFn 4.13% 75.612us 94.46% 1.730ms 576.726us 0.000us 0.00% 14.144us 4.715us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 25.780us 88.71% 1.625ms 541.586us 10.592us 100.00% 14.144us 4.715us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 10.592us 100.00% 10.592us 3.531us 3 - Activity Buffer Request 78.55% 1.439ms 78.55% 1.439ms 1.439ms 3.552us 33.53% 3.552us 3.552us 1 - aten::empty_like 0.48% 8.780us 1.63% 29.810us 9.937us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.15% 21.030us 1.15% 21.030us 7.010us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.75% 160.332us 8.75% 160.332us 53.444us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.31% 5.650us 0.31% 5.650us 5.650us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.381us 1167.35% 124.381us 124.381us 1 + hf_kernels_causal_conv1d 4.48% 85.542us 99.75% 1.904ms 1.904ms 0.000us 0.00% 14.271us 14.271us 1 + CausalConv1dFn 3.83% 73.182us 95.27% 1.819ms 606.282us 0.000us 0.00% 14.271us 4.757us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 26.960us 89.88% 1.716ms 571.988us 10.655us 100.00% 14.271us 4.757us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 10.655us 100.00% 10.655us 3.552us 3 + Activity Buffer Request 76.01% 1.451ms 76.01% 1.451ms 1.451ms 3.616us 33.94% 3.616us 3.616us 1 + aten::empty_like 0.43% 8.120us 1.56% 29.700us 9.900us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.13% 21.580us 1.13% 21.580us 7.193us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 12.45% 237.787us 12.45% 237.787us 79.262us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.25% 4.860us 0.25% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.832ms -Self CUDA time total: 10.592us +Self CPU time total: 1.909ms +Self CUDA time total: 10.655us @@ -4362,19 +4364,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.356us 1093.80% 119.356us 119.356us 1 - hf_kernels_causal_conv1d 19.79% 94.221us 98.72% 469.928us 469.928us 0.000us 0.00% 14.592us 14.592us 1 - CausalConv1dFn 14.74% 70.172us 78.93% 375.707us 125.236us 0.000us 0.00% 14.592us 4.864us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.30% 25.240us 58.06% 276.375us 92.125us 10.912us 100.00% 14.592us 4.864us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 10.912us 100.00% 10.912us 3.637us 3 - Activity Buffer Request 19.79% 94.192us 19.79% 94.192us 94.192us 3.680us 33.72% 3.680us 3.680us 1 - aten::empty_like 1.68% 7.980us 6.13% 29.160us 9.720us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.45% 21.180us 4.45% 21.180us 7.060us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 32.97% 156.943us 32.97% 156.943us 52.314us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.28% 6.090us 1.28% 6.090us 6.090us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.652us 1120.72% 122.652us 122.652us 1 + hf_kernels_causal_conv1d 12.91% 86.303us 99.27% 663.588us 663.588us 0.000us 0.00% 14.624us 14.624us 1 + CausalConv1dFn 10.74% 71.821us 86.36% 577.285us 192.428us 0.000us 0.00% 14.624us 4.875us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 3.81% 25.480us 71.21% 476.023us 158.674us 10.944us 100.00% 14.624us 4.875us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 100.00% 10.944us 3.648us 3 + Activity Buffer Request 32.82% 219.426us 32.82% 219.426us 219.426us 3.680us 33.63% 3.680us 3.680us 1 + aten::empty_like 1.14% 7.591us 4.40% 29.441us 9.814us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.27% 21.850us 3.27% 21.850us 7.283us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.57% 231.117us 34.57% 231.117us 77.039us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.73% 4.900us 0.73% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 476.018us -Self CUDA time total: 10.912us +Self CPU time total: 668.488us +Self CUDA time total: 10.944us @@ -4384,19 +4386,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.375us 1178.71% 129.375us 129.375us 1 - hf_kernels_causal_conv1d 5.38% 99.351us 99.70% 1.840ms 1.840ms 0.000us 0.00% 14.656us 14.656us 1 - CausalConv1dFn 4.01% 73.942us 94.32% 1.740ms 580.087us 0.000us 0.00% 14.656us 4.885us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.38% 25.552us 88.67% 1.636ms 545.346us 10.976us 100.00% 14.656us 4.885us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 10.976us 100.00% 10.976us 3.659us 3 - Activity Buffer Request 78.64% 1.451ms 78.64% 1.451ms 1.451ms 3.680us 33.53% 3.680us 3.680us 1 - aten::empty_like 0.48% 8.800us 1.64% 30.280us 10.093us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.16% 21.480us 1.16% 21.480us 7.160us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.64% 159.392us 8.64% 159.392us 53.131us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.30% 5.531us 0.30% 5.531us 5.531us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.430us 1181.43% 130.430us 130.430us 1 + hf_kernels_causal_conv1d 4.23% 79.341us 99.73% 1.871ms 1.871ms 0.000us 0.00% 14.784us 14.784us 1 + CausalConv1dFn 4.03% 75.521us 95.50% 1.792ms 597.206us 0.000us 0.00% 14.784us 4.928us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.43% 26.810us 89.82% 1.685ms 561.675us 11.040us 100.00% 14.784us 4.928us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 11.040us 100.00% 11.040us 3.680us 3 + Activity Buffer Request 77.07% 1.446ms 77.07% 1.446ms 1.446ms 3.744us 33.91% 3.744us 3.744us 1 + aten::empty_like 0.44% 8.272us 1.66% 31.072us 10.357us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.22% 22.800us 1.22% 22.800us 7.600us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 11.32% 212.286us 11.32% 212.286us 70.762us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 5.130us 0.27% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.845ms -Self CUDA time total: 10.976us +Self CPU time total: 1.876ms +Self CUDA time total: 11.040us @@ -4406,19 +4408,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.679us 1104.47% 123.679us 123.679us 1 - hf_kernels_causal_conv1d 17.75% 87.860us 98.92% 489.618us 489.618us 0.000us 0.00% 14.974us 14.974us 1 - CausalConv1dFn 14.77% 73.091us 81.17% 401.758us 133.919us 0.000us 0.00% 14.974us 4.991us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.42% 26.830us 60.45% 299.195us 99.732us 11.198us 100.00% 14.974us 4.991us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 11.198us 100.00% 11.198us 3.733us 3 - Activity Buffer Request 20.28% 100.392us 20.28% 100.392us 100.392us 3.776us 33.72% 3.776us 3.776us 1 - aten::empty_like 1.69% 8.381us 5.95% 29.472us 9.824us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.26% 21.091us 4.26% 21.091us 7.030us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 34.75% 171.973us 34.75% 171.973us 57.324us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.08% 5.331us 1.08% 5.331us 5.331us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.097us 1060.18% 120.097us 120.097us 1 + hf_kernels_causal_conv1d 13.35% 76.301us 99.17% 566.674us 566.674us 0.000us 0.00% 15.168us 15.168us 1 + CausalConv1dFn 12.80% 73.153us 85.81% 490.373us 163.458us 0.000us 0.00% 15.168us 5.056us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.71% 26.911us 68.00% 388.569us 129.523us 11.328us 100.00% 15.168us 5.056us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 11.328us 100.00% 11.328us 3.776us 3 + Activity Buffer Request 34.49% 197.075us 34.49% 197.075us 197.075us 3.840us 33.90% 3.840us 3.840us 1 + aten::empty_like 1.29% 7.379us 5.01% 28.651us 9.550us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.72% 21.272us 3.72% 21.272us 7.091us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 28.80% 164.583us 28.80% 164.583us 54.861us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.83% 4.760us 0.83% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 494.949us -Self CUDA time total: 11.198us +Self CPU time total: 571.434us +Self CUDA time total: 11.328us @@ -4428,19 +4430,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 132.959us 264.31% 132.959us 132.959us 1 - hf_kernels_causal_conv1d 5.33% 97.801us 99.71% 1.830ms 1.830ms 0.000us 0.00% 83.968us 83.968us 1 - CausalConv1dFn 4.03% 73.903us 94.38% 1.732ms 577.264us 0.000us 0.00% 83.968us 27.989us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.44% 26.339us 88.71% 1.628ms 542.606us 50.304us 100.00% 83.968us 27.989us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 50.304us 100.00% 50.304us 16.768us 3 - Activity Buffer Request 78.52% 1.441ms 78.52% 1.441ms 1.441ms 33.664us 66.92% 33.664us 33.664us 1 - aten::empty_like 0.46% 8.510us 1.64% 30.070us 10.023us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.17% 21.560us 1.17% 21.560us 7.187us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.75% 160.594us 8.75% 160.594us 53.531us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.29% 5.400us 0.29% 5.400us 5.400us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 133.919us 265.71% 133.919us 133.919us 1 + hf_kernels_causal_conv1d 4.38% 80.552us 99.73% 1.836ms 1.836ms 0.000us 0.00% 83.873us 83.873us 1 + CausalConv1dFn 4.09% 75.353us 95.35% 1.755ms 585.145us 0.000us 0.00% 83.873us 27.958us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.33% 24.410us 89.50% 1.648ms 549.264us 50.401us 100.00% 83.873us 27.958us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 50.401us 100.00% 50.401us 16.800us 3 + Activity Buffer Request 79.01% 1.455ms 79.01% 1.455ms 1.455ms 33.472us 66.41% 33.472us 33.472us 1 + aten::empty_like 0.45% 8.369us 1.75% 32.290us 10.763us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.30% 23.921us 1.30% 23.921us 7.974us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 9.17% 168.764us 9.17% 168.764us 56.255us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 5.020us 0.27% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.835ms -Self CUDA time total: 50.304us +Self CPU time total: 1.841ms +Self CUDA time total: 50.401us @@ -4450,18 +4452,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.085us 244.46% 125.085us 125.085us 1 - hf_kernels_causal_conv1d 15.91% 74.080us 98.78% 459.898us 459.898us 0.000us 0.00% 85.694us 85.694us 1 - CausalConv1dFn 15.58% 72.521us 82.87% 385.818us 128.606us 0.000us 0.00% 85.694us 28.565us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.92% 27.572us 61.05% 284.236us 94.745us 51.167us 100.00% 85.694us 28.565us 3 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 131.005us 256.03% 131.005us 131.005us 1 + hf_kernels_causal_conv1d 11.69% 77.241us 
99.25% 655.717us 655.717us 0.000us 0.00% 85.534us 85.534us 1 + CausalConv1dFn 10.97% 72.503us 87.56% 578.476us 192.825us 0.000us 0.00% 85.534us 28.511us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 3.89% 25.692us 71.76% 474.103us 158.034us 51.167us 100.00% 85.534us 28.511us 3 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 51.167us 100.00% 51.167us 17.056us 3 - Activity Buffer Request 21.78% 101.412us 21.78% 101.412us 101.412us 34.527us 67.48% 34.527us 34.527us 1 - aten::empty_like 1.68% 7.830us 6.24% 29.061us 9.687us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.56% 21.231us 4.56% 21.231us 7.077us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 33.35% 155.252us 33.35% 155.252us 51.751us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.22% 5.680us 1.22% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1 + Activity Buffer Request 43.08% 284.587us 43.08% 284.587us 284.587us 34.367us 67.17% 34.367us 34.367us 1 + aten::empty_like 1.14% 7.549us 4.82% 31.870us 10.623us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.68% 24.321us 3.68% 24.321us 8.107us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 24.80% 163.824us 24.80% 163.824us 54.608us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.75% 4.929us 0.75% 4.929us 4.929us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 465.578us +Self CPU time total: 660.646us Self CUDA time total: 51.167us @@ -4472,19 +4474,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total 
CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.583us 3164.74% 123.583us 123.583us 1 - hf_kernels_causal_conv1d 8.70% 75.560us 99.36% 863.215us 863.215us 0.000us 0.00% 5.153us 5.153us 1 - CausalConv1dFn 8.33% 72.353us 90.66% 787.655us 262.552us 0.000us 0.00% 5.153us 1.718us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 2.88% 25.000us 78.85% 685.062us 228.354us 3.905us 100.00% 5.153us 1.718us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.905us 100.00% 3.905us 1.302us 3 - Activity Buffer Request 57.61% 500.499us 57.61% 500.499us 500.499us 1.248us 31.96% 1.248us 1.248us 1 - aten::empty_like 0.96% 8.370us 3.48% 30.240us 10.080us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 2.52% 21.870us 2.52% 21.870us 7.290us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 18.37% 159.563us 18.37% 159.563us 53.188us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.64% 5.560us 0.64% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.686us 3040.89% 118.686us 118.686us 1 + hf_kernels_causal_conv1d 11.60% 73.750us 99.24% 631.216us 631.216us 0.000us 0.00% 5.183us 5.183us 1 + CausalConv1dFn 11.30% 71.845us 87.65% 557.466us 185.822us 0.000us 0.00% 5.183us 1.728us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.22% 26.861us 71.87% 457.101us 152.367us 3.903us 100.00% 5.183us 1.728us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.903us 100.00% 3.903us 1.301us 3 + Activity Buffer Request 42.38% 269.577us 42.38% 269.577us 269.577us 1.280us 32.80% 1.280us 1.280us 1 + aten::empty_like 1.23% 7.810us 4.48% 28.520us 9.507us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.26% 20.710us 3.26% 20.710us 6.903us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 25.26% 160.663us 25.26% 160.663us 53.554us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.76% 4.821us 0.76% 4.821us 4.821us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 868.775us -Self CUDA time total: 3.905us +Self CPU time total: 636.037us +Self CUDA time total: 3.903us @@ -4494,19 +4496,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.845us 3044.19% 118.845us 118.845us 1 - hf_kernels_causal_conv1d 16.55% 74.260us 98.76% 443.077us 443.077us 0.000us 0.00% 5.152us 5.152us 1 - CausalConv1dFn 15.87% 71.182us 82.21% 368.817us 122.939us 0.000us 0.00% 5.152us 1.717us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.48% 24.591us 59.34% 266.204us 88.735us 3.904us 100.00% 5.152us 1.717us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.904us 100.00% 3.904us 1.301us 3 - Activity Buffer Request 18.72% 83.961us 18.72% 83.961us 83.961us 1.248us 31.97% 1.248us 1.248us 1 - aten::empty_like 1.83% 8.189us 7.01% 31.431us 10.477us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 5.18% 23.242us 5.18% 23.242us 7.747us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 35.14% 157.652us 35.14% 157.652us 52.551us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.24% 5.551us 1.24% 5.551us 5.551us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.221us 3029.76% 120.221us 120.221us 1 + hf_kernels_causal_conv1d 13.01% 75.082us 99.09% 571.775us 571.775us 0.000us 0.00% 5.248us 5.248us 1 + CausalConv1dFn 12.35% 71.241us 86.08% 496.693us 165.564us 0.000us 0.00% 5.248us 1.749us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.88% 28.181us 68.58% 395.720us 131.907us 3.968us 100.00% 5.248us 1.749us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3 + Activity Buffer Request 36.26% 209.246us 36.26% 209.246us 209.246us 1.280us 32.26% 1.280us 1.280us 1 + aten::empty_like 1.42% 8.172us 5.15% 29.732us 9.911us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.74% 21.560us 3.74% 21.560us 7.187us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 27.43% 158.293us 27.43% 158.293us 52.764us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.91% 5.270us 0.91% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 448.628us -Self CUDA time total: 3.904us +Self CPU time total: 577.045us +Self CUDA time total: 3.968us @@ -4516,19 +4518,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.816us 3046.03% 122.816us 122.816us 1 - hf_kernels_causal_conv1d 8.66% 75.390us 99.38% 865.505us 865.505us 0.000us 0.00% 5.376us 5.376us 1 - CausalConv1dFn 8.40% 73.201us 90.72% 790.115us 263.372us 0.000us 0.00% 5.376us 1.792us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 3.02% 26.261us 78.90% 687.193us 229.064us 4.032us 100.00% 5.376us 1.792us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3 - Activity Buffer Request 57.07% 497.089us 57.07% 497.089us 497.089us 1.344us 33.33% 1.344us 1.344us 1 - aten::empty_like 0.93% 8.130us 3.41% 29.721us 9.907us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 2.48% 21.591us 2.48% 21.591us 7.197us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 18.81% 163.843us 18.81% 163.843us 54.614us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.62% 5.440us 0.62% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.374us 2843.36% 117.374us 117.374us 1 + hf_kernels_causal_conv1d 14.38% 74.792us 98.97% 514.843us 514.843us 0.000us 0.00% 5.504us 5.504us 1 + CausalConv1dFn 13.25% 68.940us 84.59% 440.051us 146.684us 0.000us 0.00% 5.504us 1.835us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.99% 25.981us 65.51% 340.779us 113.593us 4.128us 100.00% 5.504us 1.835us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3 + Activity Buffer Request 29.84% 155.214us 29.84% 155.214us 155.214us 1.376us 33.33% 1.376us 1.376us 1 + aten::empty_like 1.55% 8.080us 5.83% 30.332us 10.111us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.28% 22.252us 4.28% 22.252us 7.417us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 30.68% 159.584us 30.68% 159.584us 53.195us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.03% 5.380us 1.03% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 870.945us -Self CUDA time total: 4.032us +Self CPU time total: 520.223us +Self CUDA time total: 4.128us @@ -4538,18 +4540,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 116.446us 2866.01% 116.446us 116.446us 1 - hf_kernels_causal_conv1d 16.24% 74.671us 98.84% 454.378us 454.378us 0.000us 0.00% 5.407us 5.407us 1 - CausalConv1dFn 15.28% 70.221us 82.60% 379.707us 126.569us 0.000us 0.00% 5.407us 1.802us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.99% 27.540us 61.00% 280.405us 93.468us 4.063us 100.00% 5.407us 1.802us 3 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 116.831us 2875.49% 116.831us 116.831us 1 + hf_kernels_causal_conv1d 13.78% 75.282us 99.09% 
541.484us 541.484us 0.000us 0.00% 5.439us 5.439us 1 + CausalConv1dFn 12.58% 68.741us 85.32% 466.202us 155.401us 0.000us 0.00% 5.439us 1.813us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.76% 26.021us 67.34% 367.980us 122.660us 4.063us 100.00% 5.439us 1.813us 3 void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3 - Activity Buffer Request 21.14% 97.192us 21.14% 97.192us 97.192us 1.344us 33.08% 1.344us 1.344us 1 - aten::empty_like 1.73% 7.931us 6.33% 29.081us 9.694us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.60% 21.150us 4.60% 21.150us 7.050us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 33.86% 155.673us 33.86% 155.673us 51.891us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.16% 5.330us 1.16% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1 + Activity Buffer Request 33.52% 183.175us 33.52% 183.175us 183.175us 1.376us 33.87% 1.376us 1.376us 1 + aten::empty_like 1.37% 7.489us 5.40% 29.481us 9.827us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.02% 21.992us 4.02% 21.992us 7.331us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 29.06% 158.784us 29.06% 158.784us 52.928us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.91% 4.951us 0.91% 4.951us 4.951us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 459.708us +Self CPU time total: 546.435us Self CUDA time total: 4.063us @@ -4560,19 +4562,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls 
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.895us 2262.26% 120.895us 120.895us 1 - hf_kernels_causal_conv1d 10.03% 75.040us 99.26% 742.432us 742.432us 0.000us 0.00% 7.136us 7.136us 1 - CausalConv1dFn 9.57% 71.601us 89.23% 667.392us 222.464us 0.000us 0.00% 7.136us 2.379us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 3.57% 26.722us 75.60% 565.480us 188.493us 5.344us 100.00% 7.136us 2.379us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.344us 100.00% 5.344us 1.781us 3 - Activity Buffer Request 50.95% 381.056us 50.95% 381.056us 381.056us 1.792us 33.53% 1.792us 1.792us 1 - aten::empty_like 1.09% 8.161us 4.05% 30.311us 10.104us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 2.96% 22.150us 2.96% 22.150us 7.383us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 21.08% 157.702us 21.08% 157.702us 52.567us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.74% 5.510us 0.74% 5.510us 5.510us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.806us 2228.53% 119.806us 119.806us 1 + hf_kernels_causal_conv1d 11.93% 76.073us 99.21% 632.507us 632.507us 0.000us 0.00% 7.200us 7.200us 1 + CausalConv1dFn 11.21% 71.480us 87.28% 556.434us 185.478us 0.000us 0.00% 7.200us 2.400us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.13% 26.361us 71.46% 455.612us 151.871us 5.376us 100.00% 7.200us 2.400us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.376us 100.00% 5.376us 1.792us 3 + Activity Buffer Request 42.49% 270.867us 42.49% 270.867us 270.867us 1.824us 33.93% 1.824us 1.824us 1 + aten::empty_like 1.24% 7.892us 4.60% 29.342us 9.781us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.36% 21.450us 3.36% 21.450us 7.150us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 24.84% 158.384us 24.84% 158.384us 52.795us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.79% 5.050us 0.79% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 747.942us -Self CUDA time total: 5.344us +Self CPU time total: 637.557us +Self CUDA time total: 5.376us @@ -4582,19 +4584,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 114.428us 2091.54% 114.428us 114.428us 1 - hf_kernels_causal_conv1d 15.93% 72.612us 98.81% 450.477us 450.477us 0.000us 0.00% 7.327us 7.327us 1 - CausalConv1dFn 15.28% 69.671us 82.88% 377.865us 125.955us 0.000us 0.00% 7.327us 2.442us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.81% 26.480us 61.42% 279.994us 93.331us 5.471us 100.00% 7.327us 2.442us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.471us 100.00% 5.471us 1.824us 3 - Activity Buffer Request 21.45% 97.772us 21.45% 97.772us 97.772us 1.856us 33.92% 1.856us 1.856us 1 - aten::empty_like 1.75% 7.980us 6.19% 28.200us 9.400us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.44% 20.220us 4.44% 20.220us 6.740us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 34.16% 155.742us 34.16% 155.742us 51.914us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.19% 5.420us 1.19% 5.420us 5.420us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.676us 2174.35% 119.676us 119.676us 1 + hf_kernels_causal_conv1d 14.25% 74.352us 99.01% 516.513us 516.513us 0.000us 0.00% 7.392us 7.392us 1 + CausalConv1dFn 14.02% 73.122us 84.76% 442.161us 147.387us 0.000us 0.00% 7.392us 2.464us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.04% 26.281us 65.18% 340.038us 113.346us 5.504us 100.00% 7.392us 2.464us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.504us 100.00% 5.504us 1.835us 3 + Activity Buffer Request 30.19% 157.524us 30.19% 157.524us 157.524us 1.888us 34.30% 1.888us 1.888us 1 + aten::empty_like 1.50% 7.800us 5.56% 29.001us 9.667us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.06% 21.201us 4.06% 21.201us 7.067us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 29.95% 156.233us 29.95% 156.233us 52.078us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.99% 5.180us 0.99% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 455.897us -Self CUDA time total: 5.471us +Self CPU time total: 521.693us +Self CUDA time total: 5.504us @@ -4604,19 +4606,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.251us 717.80% 124.251us 124.251us 1 - hf_kernels_causal_conv1d 10.05% 75.520us 99.24% 745.563us 745.563us 0.000us 0.00% 23.101us 23.101us 1 - CausalConv1dFn 9.33% 70.111us 89.19% 670.043us 223.348us 0.000us 0.00% 23.101us 7.700us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 3.43% 25.770us 75.92% 570.342us 190.114us 17.310us 100.00% 23.101us 7.700us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 17.310us 100.00% 17.310us 5.770us 3 - Activity Buffer Request 51.18% 384.497us 51.18% 384.497us 384.497us 5.791us 33.45% 5.791us 5.791us 1 - aten::empty_like 1.14% 8.540us 3.94% 29.590us 9.863us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 2.80% 21.050us 2.80% 21.050us 7.017us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 21.31% 160.075us 21.31% 160.075us 53.358us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.76% 5.680us 0.76% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.798us 715.63% 124.798us 124.798us 1 + hf_kernels_causal_conv1d 11.85% 75.293us 99.15% 630.167us 630.167us 0.000us 0.00% 23.295us 23.295us 1 + CausalConv1dFn 11.06% 70.310us 87.30% 554.874us 184.958us 0.000us 0.00% 23.295us 7.765us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 4.18% 26.540us 71.39% 453.732us 151.244us 17.439us 100.00% 23.295us 7.765us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 17.439us 100.00% 17.439us 5.813us 3 + Activity Buffer Request 42.20% 268.237us 42.20% 268.237us 268.237us 5.856us 33.58% 5.856us 5.856us 1 + aten::empty_like 1.25% 7.951us 4.85% 30.832us 10.277us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.60% 22.881us 3.60% 22.881us 7.627us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 25.01% 158.955us 25.01% 158.955us 52.985us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.85% 5.410us 0.85% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 751.243us -Self CUDA time total: 17.310us +Self CPU time total: 635.577us +Self CUDA time total: 17.439us @@ -4626,19 +4628,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.596us 682.20% 121.596us 121.596us 1 - hf_kernels_causal_conv1d 16.81% 75.551us 98.76% 443.797us 443.797us 0.000us 0.00% 23.808us 23.808us 1 - CausalConv1dFn 15.22% 68.400us 81.95% 368.246us 122.749us 0.000us 0.00% 23.808us 7.936us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.83% 26.181us 60.07% 269.934us 89.978us 17.824us 100.00% 23.808us 7.936us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 17.824us 100.00% 17.824us 5.941us 3 - Activity Buffer Request 19.24% 86.441us 19.24% 86.441us 86.441us 5.984us 33.57% 5.984us 5.984us 1 - aten::empty_like 1.76% 7.900us 6.66% 29.912us 9.971us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.90% 22.012us 4.90% 22.012us 7.337us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 35.01% 157.312us 35.01% 157.312us 52.437us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.24% 5.550us 1.24% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.252us 695.89% 124.252us 124.252us 1 + hf_kernels_causal_conv1d 15.28% 76.213us 99.04% 494.053us 494.053us 0.000us 0.00% 23.839us 23.839us 1 + CausalConv1dFn 14.60% 72.841us 83.76% 417.840us 139.280us 0.000us 0.00% 23.839us 7.946us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.38% 26.851us 63.27% 315.607us 105.202us 17.855us 100.00% 23.839us 7.946us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 17.855us 100.00% 17.855us 5.952us 3 + Activity Buffer Request 26.40% 131.703us 26.40% 131.703us 131.703us 5.984us 33.51% 5.984us 5.984us 1 + aten::empty_like 1.62% 8.090us 5.89% 29.392us 9.797us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.27% 21.302us 4.27% 21.302us 7.101us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 31.48% 157.053us 31.48% 157.053us 52.351us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.96% 4.810us 0.96% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 449.347us -Self CUDA time total: 17.824us +Self CPU time total: 498.863us +Self CUDA time total: 17.855us @@ -4648,19 +4650,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.077us 686.13% 122.077us 122.077us 1 - hf_kernels_causal_conv1d 12.00% 91.181us 99.29% 754.243us 754.243us 0.000us 0.00% 23.808us 23.808us 1 - CausalConv1dFn 9.45% 71.802us 87.29% 663.062us 221.021us 0.000us 0.00% 23.808us 7.936us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 3.27% 24.831us 73.88% 561.180us 187.060us 17.792us 100.00% 23.808us 7.936us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 17.792us 100.00% 17.792us 5.931us 3 - Activity Buffer Request 49.89% 378.947us 49.89% 378.947us 378.947us 6.016us 33.81% 6.016us 6.016us 1 - aten::empty_like 1.06% 8.020us 3.96% 30.080us 10.027us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 2.90% 22.060us 2.90% 22.060us 7.353us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 20.72% 157.402us 20.72% 157.402us 52.467us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.71% 5.381us 0.71% 5.381us 5.381us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.253us 695.94% 124.253us 124.253us 1 + hf_kernels_causal_conv1d 14.09% 92.581us 99.22% 652.096us 652.096us 0.000us 0.00% 23.838us 23.838us 1 + CausalConv1dFn 11.45% 75.254us 85.13% 559.515us 186.505us 0.000us 0.00% 23.838us 7.946us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 3.84% 25.251us 69.30% 455.481us 151.827us 17.854us 100.00% 23.838us 7.946us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 17.854us 100.00% 17.854us 5.951us 3 + Activity Buffer Request 41.42% 272.247us 41.42% 272.247us 272.247us 5.984us 33.52% 5.984us 5.984us 1 + aten::empty_like 1.19% 7.849us 4.38% 28.780us 9.593us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 3.18% 20.931us 3.18% 20.931us 6.977us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 24.04% 157.983us 24.04% 157.983us 52.661us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.78% 5.140us 0.78% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 759.624us -Self CUDA time total: 17.792us +Self CPU time total: 657.236us +Self CUDA time total: 17.854us @@ -4670,19 +4672,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.351us 671.15% 124.351us 124.351us 1 - hf_kernels_causal_conv1d 19.13% 92.321us 98.80% 476.748us 476.748us 0.000us 0.00% 24.736us 24.736us 1 - CausalConv1dFn 14.83% 71.551us 79.67% 384.427us 128.142us 0.000us 0.00% 24.736us 8.245us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.89% 28.409us 58.58% 282.676us 94.225us 18.528us 100.00% 24.736us 8.245us 3 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 18.528us 100.00% 18.528us 6.176us 3 - Activity Buffer Request 20.26% 97.782us 20.26% 97.782us 97.782us 6.208us 33.51% 6.208us 6.208us 1 - aten::empty_like 1.73% 8.360us 6.26% 30.200us 10.067us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.53% 21.840us 4.53% 21.840us 7.280us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 32.43% 156.485us 32.43% 156.485us 52.162us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.20% 5.770us 1.20% 5.770us 5.770us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.982us 651.61% 121.982us 121.982us 1 + hf_kernels_causal_conv1d 16.26% 76.273us 99.00% 464.343us 464.343us 0.000us 0.00% 25.088us 25.088us 1 + CausalConv1dFn 15.20% 71.302us 82.74% 388.070us 129.357us 0.000us 0.00% 25.088us 8.363us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.49% 25.750us 61.15% 286.808us 95.603us 18.720us 100.00% 25.088us 8.363us 3 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 18.720us 100.00% 18.720us 6.240us 3 + Activity Buffer Request 22.13% 103.813us 22.13% 103.813us 103.813us 6.368us 34.02% 6.368us 6.368us 1 + aten::empty_like 1.75% 8.210us 6.39% 29.960us 9.987us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.64% 21.750us 4.64% 21.750us 7.250us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 33.53% 157.245us 33.53% 157.245us 52.415us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.00% 4.680us 1.00% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 482.518us -Self CUDA time total: 18.528us +Self CPU time total: 469.023us +Self CUDA time total: 18.720us @@ -4692,19 +4694,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 5.47% 101.271us 99.69% 1.845ms 1.845ms 0.000us 0.00% 162.913us 162.913us 1 - CausalConv1dFn 4.05% 75.021us 94.22% 1.743ms 581.104us 0.000us 0.00% 162.913us 54.304us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 1.32% 24.372us 88.46% 1.637ms 545.603us 97.697us 100.00% 162.913us 54.304us 3 - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 139.807us 143.10% 139.807us 139.807us 1 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 97.697us 100.00% 97.697us 32.566us 3 - Activity Buffer Request 78.43% 1.451ms 78.43% 1.451ms 1.451ms 65.216us 66.75% 65.216us 65.216us 1 - aten::empty_like 0.45% 8.320us 1.70% 31.480us 10.493us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 1.25% 23.160us 1.25% 23.160us 7.720us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 8.71% 161.192us 8.71% 161.192us 53.731us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 0.31% 5.721us 0.31% 5.721us 5.721us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 4.40% 80.973us 99.73% 1.837ms 1.837ms 0.000us 0.00% 162.749us 162.749us 1 + CausalConv1dFn 4.14% 76.301us 95.33% 1.756ms 585.285us 0.000us 0.00% 162.749us 54.250us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 1.45% 26.730us 89.50% 1.648ms 549.474us 97.918us 100.00% 162.749us 54.250us 3 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 141.950us 144.97% 141.950us 141.950us 1 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 97.918us 100.00% 97.918us 32.639us 3 + Activity Buffer Request 78.99% 1.455ms 78.99% 1.455ms 1.455ms 64.831us 66.21% 64.831us 64.831us 1 + aten::empty_like 0.45% 8.340us 1.69% 31.131us 10.377us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 1.24% 22.791us 1.24% 22.791us 7.597us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 9.06% 166.885us 9.06% 166.885us 55.628us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 0.27% 4.980us 0.27% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.850ms -Self CUDA time total: 97.697us +Self CPU time total: 1.842ms +Self CUDA time total: 97.918us @@ -4714,19 +4716,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_causal_conv1d 19.60% 95.701us 98.90% 482.848us 482.848us 0.000us 0.00% 163.744us 163.744us 1 - CausalConv1dFn 15.21% 74.281us 79.29% 387.147us 129.049us 0.000us 0.00% 163.744us 54.581us 3 - _causal_conv1d_90f5a60::causal_conv1d_fwd 5.67% 27.701us 57.93% 282.846us 94.282us 98.688us 100.00% 163.744us 54.581us 3 - hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 139.968us 141.83% 139.968us 139.968us 1 -void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 
0.00% 0.000us 0.00% 0.000us 0.000us 98.688us 100.00% 98.688us 32.896us 3 - Activity Buffer Request 19.94% 97.362us 19.94% 97.362us 97.362us 65.056us 65.92% 65.056us 65.056us 1 - aten::empty_like 1.68% 8.190us 6.15% 30.020us 10.007us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 4.47% 21.830us 4.47% 21.830us 7.277us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 32.32% 157.783us 32.32% 157.783us 52.594us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 1.10% 5.391us 1.10% 5.391us 5.391us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_causal_conv1d 16.07% 76.871us 98.94% 473.172us 473.172us 0.000us 0.00% 163.803us 163.803us 1 + CausalConv1dFn 14.96% 71.532us 82.87% 396.301us 132.100us 0.000us 0.00% 163.803us 54.601us 3 + _causal_conv1d_90f5a60::causal_conv1d_fwd 5.75% 27.501us 61.56% 294.418us 98.139us 98.685us 100.00% 163.803us 54.601us 3 + hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 133.180us 134.95% 133.180us 133.180us 1 +void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 98.685us 100.00% 98.685us 32.895us 3 + Activity Buffer Request 21.65% 103.543us 21.65% 103.543us 103.543us 65.118us 65.99% 65.118us 65.118us 1 + aten::empty_like 1.52% 7.251us 6.35% 30.351us 10.117us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 4.83% 23.100us 4.83% 23.100us 7.700us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 34.16% 163.374us 34.16% 163.374us 54.458us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 1.06% 5.061us 1.06% 5.061us 5.061us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 488.239us -Self CUDA time total: 98.688us +Self CPU time total: 478.233us +Self CUDA time total: 98.685us impl wl p50(ms) ok @@ -4758,13 +4760,13 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
▶ UV Install Logs
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 6.41it/s] -Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:00, 4.26it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 6.78it/s]
+Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 9.42it/s] +Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:00, 4.98it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 7.98it/s]

Artifacts:

causal_conv1d.jsonl diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html index afa77a96b2a763e56f05f85b5cc1eef91c17fd17..6358d2b943cf22bb9f31aeb2e669932f13397132 100644 --- a/causal_conv1d/impls/torch_causal_conv1d.html +++ b/causal_conv1d/impls/torch_causal_conv1d.html @@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.28s +Cell: nv | 0.21s | Raw -GitHub +GitHub
@@ -4122,7 +4122,7 @@ Cell: nv | 0.28s
-
Thu Oct 30 15:51:43 2025       
+
Fri Oct 31 20:00:25 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   33C    P0             79W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4153,13 +4153,13 @@ Cell: nv | 0.28s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 32.46s
+Cell: benchmark | 3.68s
  | 
 
 Raw
-GitHub
+GitHub
 
@@ -4217,29 +4217,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 467.230us 2421.38% 467.230us 467.230us 1 - torch_eager 10.72% 231.062us 99.69% 2.148ms 2.148ms 0.000us 0.00% 21.632us 21.632us 1 - aten::to 0.58% 12.480us 78.88% 1.700ms 283.277us 0.000us 0.00% 14.336us 2.389us 6 - aten::_to_copy 2.05% 44.092us 78.31% 1.687ms 281.197us 0.000us 0.00% 14.336us 2.389us 6 - aten::copy_ 3.07% 66.050us 73.46% 1.583ms 263.783us 12.000us 62.19% 14.336us 2.389us 6 - aten::conv1d 0.49% 10.600us 7.90% 170.164us 56.721us 0.000us 0.00% 7.296us 2.432us 3 - aten::convolution 0.77% 16.490us 7.41% 159.564us 53.188us 0.000us 0.00% 7.296us 2.432us 3 - aten::_convolution 1.64% 35.301us 6.64% 143.074us 47.691us 0.000us 0.00% 7.296us 2.432us 3 - aten::_conv_depthwise2d 1.69% 36.381us 4.00% 86.271us 28.757us 7.296us 37.81% 7.296us 2.432us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.296us 37.81% 7.296us 2.432us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.67% 6.304us 2.101us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 29.52% 5.696us 1.899us 3 - Activity Buffer Request 66.85% 1.440ms 66.85% 1.440ms 1.440ms 2.336us 12.11% 2.336us 2.336us 1 - aten::empty_strided 2.80% 60.390us 2.80% 60.390us 10.065us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 4.73% 101.823us 4.73% 101.823us 11.314us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.46% 31.451us 1.84% 39.731us 4.415us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.62% 13.289us 0.62% 13.289us 0.886us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.58% 12.560us 0.58% 12.560us 4.187us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.54% 11.740us 0.54% 11.740us 3.913us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.42% 8.963us 0.49% 10.602us 3.534us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 439.324us 2269.12% 439.324us 439.324us 1 + torch_eager 10.31% 220.478us 99.69% 2.131ms 2.131ms 0.000us 0.00% 21.729us 21.729us 1 + aten::to 0.50% 10.770us 79.87% 1.707ms 284.530us 0.000us 0.00% 14.369us 2.395us 6 + aten::_to_copy 1.71% 36.499us 79.36% 1.696ms 282.735us 0.000us 0.00% 14.369us 2.395us 6 + aten::copy_ 2.77% 59.234us 75.21% 1.608ms 267.930us 12.001us 61.99% 14.369us 2.395us 6 + aten::conv1d 0.36% 7.590us 7.34% 156.883us 52.294us 0.000us 0.00% 7.360us 2.453us 3 + aten::convolution 0.66% 14.070us 6.98% 149.293us 49.764us 0.000us 0.00% 7.360us 2.453us 3 + aten::_convolution 1.51% 32.210us 6.33% 135.223us 45.074us 0.000us 0.00% 7.360us 2.453us 3 + aten::_conv_depthwise2d 1.61% 34.371us 4.00% 85.463us 28.488us 7.360us 38.01% 7.360us 2.453us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 38.01% 7.360us 2.453us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.337us 32.73% 6.337us 2.112us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.25% 5.664us 1.888us 3 + Activity Buffer Request 69.37% 1.483ms 69.37% 1.483ms 1.483ms 2.368us 12.23% 2.368us 2.368us 1 + aten::empty_strided 2.45% 52.331us 2.45% 52.331us 8.722us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 4.26% 91.032us 4.26% 91.032us 10.115us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.32% 28.311us 1.71% 36.491us 4.055us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.64% 13.700us 0.64% 13.700us 0.913us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.60% 12.790us 0.60% 12.790us 4.263us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.59% 12.710us 0.59% 12.710us 4.237us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 6.640us 0.38% 8.090us 2.697us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.155ms -Self CUDA time total: 19.296us +Self CPU time total: 2.138ms +Self CUDA time total: 19.361us @@ -4249,29 +4249,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 337.566us 1726.42% 337.566us 337.566us 1 - torch_eager 6.86% 130.161us 99.69% 1.893ms 1.893ms 0.000us 0.00% 21.665us 21.665us 1 - aten::to 0.32% 6.060us 85.13% 1.616ms 269.375us 0.000us 0.00% 13.729us 2.288us 6 - aten::_to_copy 1.27% 24.100us 84.81% 1.610ms 268.365us 0.000us 0.00% 13.729us 
2.288us 6 - aten::copy_ 2.69% 51.011us 81.95% 1.556ms 259.305us 11.617us 59.41% 13.729us 2.288us 6 - aten::conv1d 0.30% 5.740us 6.23% 118.253us 39.418us 0.000us 0.00% 7.936us 2.645us 3 - aten::convolution 0.52% 9.902us 5.93% 112.513us 37.504us 0.000us 0.00% 7.936us 2.645us 3 - aten::_convolution 1.21% 22.959us 5.40% 102.611us 34.204us 0.000us 0.00% 7.936us 2.645us 3 - aten::_conv_depthwise2d 1.18% 22.461us 3.33% 63.161us 21.054us 7.936us 40.59% 7.936us 2.645us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.59% 7.936us 2.645us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.09% 6.080us 2.027us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.537us 28.32% 5.537us 1.846us 3 - Activity Buffer Request 76.56% 1.454ms 76.56% 1.454ms 1.454ms 2.112us 10.80% 2.112us 2.112us 1 - aten::empty_strided 1.59% 30.260us 1.59% 30.260us 5.043us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.84% 72.993us 3.84% 72.993us 8.110us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.96% 18.220us 1.27% 24.051us 2.672us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.50% 9.451us 0.50% 9.451us 0.630us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.52% 9.960us 0.52% 9.960us 3.320us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.48% 9.030us 0.48% 9.030us 3.010us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.31% 5.890us 0.39% 7.340us 2.447us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.789us 1742.49% 341.789us 341.789us 1 + torch_eager 7.86% 151.082us 99.71% 1.916ms 1.916ms 0.000us 0.00% 21.695us 21.695us 1 + aten::to 0.35% 6.661us 83.96% 1.614ms 268.966us 0.000us 0.00% 13.695us 2.282us 6 + aten::_to_copy 1.29% 24.781us 83.61% 1.607ms 267.856us 0.000us 0.00% 13.695us 2.282us 6 + aten::copy_ 2.59% 49.784us 80.72% 1.552ms 258.589us 11.615us 59.21% 13.695us 2.282us 6 + 
aten::conv1d 0.32% 6.220us 6.35% 122.113us 40.704us 0.000us 0.00% 8.000us 2.667us 3 + aten::convolution 0.53% 10.120us 6.03% 115.893us 38.631us 0.000us 0.00% 8.000us 2.667us 3 + aten::_convolution 1.20% 23.080us 5.50% 105.773us 35.258us 0.000us 0.00% 8.000us 2.667us 3 + aten::_conv_depthwise2d 1.19% 22.952us 3.39% 65.123us 21.708us 8.000us 40.79% 8.000us 2.667us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 40.79% 8.000us 2.667us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 30.83% 6.047us 2.016us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.39% 5.568us 1.856us 3 + Activity Buffer Request 75.54% 1.452ms 75.54% 1.452ms 1.452ms 2.080us 10.60% 2.080us 2.080us 1 + aten::empty_strided 1.60% 30.820us 1.60% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.74% 71.953us 3.74% 71.953us 7.995us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.98% 18.881us 1.29% 24.750us 2.750us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.50% 9.609us 0.50% 9.609us 0.641us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.56% 10.750us 0.56% 10.750us 3.583us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.49% 9.339us 0.49% 9.339us 3.113us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.34% 6.630us 0.42% 8.000us 2.667us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.899ms -Self CUDA time total: 19.553us +Self CPU time total: 1.922ms +Self CUDA time total: 19.615us @@ -4281,29 +4281,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ 
------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 333.726us 1795.19% 333.726us 333.726us 1 - torch_eager 6.76% 126.472us 99.71% 1.865ms 1.865ms 0.000us 0.00% 20.510us 20.510us 1 - aten::to 0.32% 5.970us 85.12% 1.592ms 265.378us 0.000us 0.00% 13.598us 2.266us 6 - aten::_to_copy 1.26% 23.561us 84.80% 1.586ms 264.383us 0.000us 0.00% 13.598us 2.266us 6 - aten::copy_ 2.75% 51.371us 81.92% 1.532ms 255.399us 11.678us 62.82% 13.598us 2.266us 6 - aten::conv1d 0.31% 5.850us 6.37% 119.083us 39.694us 0.000us 0.00% 6.912us 2.304us 3 - aten::convolution 0.54% 10.170us 6.05% 113.233us 37.744us 0.000us 0.00% 6.912us 2.304us 3 - aten::_convolution 1.25% 23.320us 5.51% 103.063us 34.354us 0.000us 0.00% 6.912us 2.304us 3 - aten::_conv_depthwise2d 1.20% 22.402us 3.41% 63.713us 21.238us 6.912us 37.18% 6.912us 2.304us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.912us 37.18% 6.912us 2.304us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.951us 32.01% 5.951us 1.984us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.727us 30.81% 5.727us 1.909us 3 - Activity Buffer Request 76.63% 1.433ms 76.63% 1.433ms 1.433ms 1.920us 10.33% 1.920us 1.920us 1 - aten::empty_strided 1.62% 30.340us 1.62% 30.340us 5.057us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.76% 70.302us 3.76% 70.302us 7.811us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.94% 17.590us 1.23% 22.950us 2.550us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.48% 8.970us 0.48% 8.970us 0.598us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.54% 10.051us 0.54% 10.051us 3.350us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.46% 8.519us 0.46% 8.519us 2.840us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.32% 5.980us 0.39% 7.380us 2.460us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 343.328us 1837.45% 343.328us 343.328us 1 + torch_eager 7.88% 151.015us 99.69% 1.911ms 1.911ms 0.000us 0.00% 20.605us 20.605us 1 + aten::to 0.33% 6.409us 84.02% 1.611ms 268.468us 0.000us 0.00% 13.662us 2.277us 6 + aten::_to_copy 1.32% 25.354us 83.68% 1.604ms 267.400us 0.000us 0.00% 13.662us 2.277us 6 + aten::copy_ 2.65% 50.770us 80.80% 1.549ms 258.170us 11.742us 62.84% 13.662us 2.277us 6 + aten::conv1d 0.33% 6.290us 6.34% 121.483us 40.494us 0.000us 0.00% 6.943us 2.314us 3 + aten::convolution 0.54% 10.430us 6.01% 115.193us 38.398us 0.000us 0.00% 6.943us 2.314us 3 + aten::_convolution 1.17% 22.439us 5.46% 104.763us 34.921us 0.000us 0.00% 6.943us 2.314us 3 + aten::_conv_depthwise2d 1.17% 22.412us 3.43% 65.843us 21.948us 6.943us 37.16% 6.943us 2.314us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.943us 37.16% 6.943us 2.314us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.982us 32.01% 5.982us 1.994us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 30.83% 5.760us 1.920us 3 + Activity Buffer Request 75.50% 1.448ms 75.50% 1.448ms 1.448ms 1.920us 10.28% 1.920us 1.920us 1 + aten::empty_strided 1.57% 30.029us 1.57% 30.029us 5.005us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.90% 74.680us 3.90% 74.680us 8.298us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.93% 17.782us 1.21% 23.252us 2.584us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.48% 9.281us 0.48% 9.281us 0.619us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.57% 10.910us 0.57% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 8.531us 0.44% 8.531us 2.844us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.32% 6.170us 0.39% 7.570us 2.523us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.871ms -Self CUDA time total: 18.590us +Self CPU time total: 1.917ms +Self CUDA time total: 18.685us @@ -4313,29 +4313,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.229us 1732.17% 339.229us 339.229us 1 - torch_eager 6.09% 126.194us 99.75% 2.066ms 2.066ms 0.000us 0.00% 21.729us 21.729us 1 - aten::to 0.29% 6.100us 86.58% 1.793ms 298.900us 0.000us 0.00% 14.018us 2.336us 6 - aten::_to_copy 1.16% 23.990us 86.28% 1.787ms 297.883us 0.000us 0.00% 14.018us 2.336us 6 
- aten::copy_ 2.58% 53.448us 83.67% 1.733ms 288.850us 11.873us 60.63% 14.018us 2.336us 6 - aten::conv1d 0.32% 6.580us 5.73% 118.763us 39.588us 0.000us 0.00% 7.711us 2.570us 3 - aten::convolution 0.48% 9.870us 5.42% 112.183us 37.394us 0.000us 0.00% 7.711us 2.570us 3 - aten::_convolution 1.09% 22.580us 4.94% 102.313us 34.104us 0.000us 0.00% 7.711us 2.570us 3 - aten::_conv_depthwise2d 1.08% 22.411us 3.09% 64.033us 21.344us 7.711us 39.37% 7.711us 2.570us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.711us 39.37% 7.711us 2.570us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.145us 31.38% 6.145us 2.048us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.25% 5.728us 1.909us 3 - Activity Buffer Request 69.66% 1.443ms 69.66% 1.443ms 1.443ms 2.145us 10.95% 2.145us 2.145us 1 - aten::empty_strided 1.46% 30.210us 1.46% 30.210us 5.035us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 12.49% 258.686us 12.49% 258.686us 28.743us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.87% 18.050us 1.12% 23.200us 2.578us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.42% 8.720us 0.42% 8.720us 0.581us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.49% 10.140us 0.49% 10.140us 3.380us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.46% 9.442us 0.46% 9.442us 3.147us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.830us 0.35% 7.220us 2.407us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.280us 1734.88% 340.280us 340.280us 1 + torch_eager 6.89% 141.563us 99.72% 2.049ms 2.049ms 0.000us 0.00% 21.726us 21.726us 1 + aten::to 0.30% 6.132us 85.38% 1.755ms 292.424us 0.000us 0.00% 13.982us 2.330us 6 + aten::_to_copy 1.19% 24.439us 85.08% 1.748ms 291.402us 0.000us 0.00% 13.982us 2.330us 6 + aten::copy_ 2.50% 51.302us 82.39% 1.693ms 282.182us 11.870us 60.52% 13.982us 2.330us 6 + 
aten::conv1d 0.29% 5.930us 5.97% 122.723us 40.908us 0.000us 0.00% 7.744us 2.581us 3 + aten::convolution 0.50% 10.300us 5.68% 116.793us 38.931us 0.000us 0.00% 7.744us 2.581us 3 + aten::_convolution 1.17% 23.960us 5.18% 106.493us 35.498us 0.000us 0.00% 7.744us 2.581us 3 + aten::_conv_depthwise2d 1.08% 22.141us 3.19% 65.452us 21.817us 7.744us 39.48% 7.744us 2.581us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 39.48% 7.744us 2.581us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.143us 31.32% 6.143us 2.048us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.727us 29.20% 5.727us 1.909us 3 + Activity Buffer Request 70.00% 1.438ms 70.00% 1.438ms 1.438ms 2.112us 10.77% 2.112us 2.112us 1 + aten::empty_strided 1.50% 30.881us 1.50% 30.881us 5.147us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.01% 226.194us 11.01% 226.194us 25.133us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.89% 18.302us 1.19% 24.432us 2.715us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.49% 9.981us 0.49% 9.981us 0.665us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.55% 11.260us 0.55% 11.260us 3.753us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.45% 9.171us 0.45% 9.171us 3.057us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.32% 6.620us 0.39% 8.030us 2.677us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.071ms -Self CUDA time total: 19.584us +Self CPU time total: 2.055ms +Self CUDA time total: 19.614us @@ -4345,29 +4345,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ 
------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 342.208us 1399.74% 342.208us 342.208us 1 - torch_eager 6.21% 125.160us 99.74% 2.012ms 2.012ms 0.000us 0.00% 26.720us 26.720us 1 - aten::to 0.29% 5.910us 86.35% 1.742ms 290.270us 0.000us 0.00% 15.168us 2.528us 6 - aten::_to_copy 1.25% 25.122us 86.06% 1.736ms 289.285us 0.000us 0.00% 15.168us 2.528us 6 - aten::copy_ 2.93% 59.190us 83.27% 1.679ms 279.905us 12.896us 52.75% 15.168us 2.528us 6 - aten::conv1d 0.28% 5.620us 5.81% 117.132us 39.044us 0.000us 0.00% 11.552us 3.851us 3 - aten::convolution 0.49% 9.910us 5.53% 111.512us 37.171us 0.000us 0.00% 11.552us 3.851us 3 - aten::_convolution 1.15% 23.280us 5.04% 101.602us 33.867us 0.000us 0.00% 11.552us 3.851us 3 - aten::_conv_depthwise2d 1.09% 21.990us 3.08% 62.201us 20.734us 11.552us 47.25% 11.552us 3.851us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.25% 11.552us 3.851us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 27.09% 6.624us 2.208us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 25.65% 6.272us 2.091us 3 - Activity Buffer Request 71.09% 1.434ms 71.09% 1.434ms 1.434ms 2.272us 9.29% 2.272us 2.272us 1 - aten::empty_strided 1.55% 31.162us 1.55% 31.162us 5.194us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.29% 207.543us 10.29% 207.543us 23.060us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.90% 18.220us 1.17% 23.681us 2.631us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.44% 8.971us 0.44% 8.971us 0.598us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.49% 9.951us 0.49% 9.951us 3.317us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.46% 9.230us 0.46% 9.230us 3.077us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.29% 5.780us 0.35% 7.150us 2.383us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 379.964us 1548.03% 379.964us 379.964us 1 + torch_eager 7.69% 160.944us 99.76% 2.089ms 2.089ms 0.000us 0.00% 26.817us 26.817us 1 + aten::to 0.33% 7.000us 83.76% 1.754ms 292.349us 0.000us 0.00% 15.265us 2.544us 6 + aten::_to_copy 1.23% 25.779us 83.43% 1.747ms 291.183us 0.000us 0.00% 15.265us 2.544us 6 + aten::copy_ 2.49% 52.100us 80.65% 1.689ms 281.484us 12.993us 52.94% 15.265us 2.544us 6 + aten::conv1d 0.31% 6.410us 6.85% 143.364us 47.788us 0.000us 0.00% 11.552us 3.851us 3 + aten::convolution 1.48% 31.021us 6.54% 136.954us 45.651us 0.000us 0.00% 11.552us 3.851us 3 + aten::_convolution 1.13% 23.621us 5.06% 105.933us 35.311us 0.000us 0.00% 11.552us 3.851us 3 + aten::_conv_depthwise2d 1.06% 22.209us 3.13% 65.632us 21.877us 11.552us 47.06% 11.552us 3.851us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.06% 11.552us 3.851us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.625us 26.99% 6.625us 2.208us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 25.94% 6.368us 2.123us 3 + Activity Buffer Request 68.76% 1.440ms 68.76% 1.440ms 1.440ms 2.272us 9.26% 2.272us 2.272us 1 + aten::empty_strided 1.55% 32.413us 1.55% 32.413us 5.402us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.50% 219.817us 10.50% 219.817us 24.424us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.87% 18.301us 1.15% 24.061us 2.673us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.50% 10.530us 0.50% 10.530us 0.702us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.50% 10.490us 0.50% 10.490us 3.497us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.47% 9.872us 0.47% 9.872us 3.291us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.30% 6.220us 0.37% 7.740us 2.580us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.017ms -Self CUDA time total: 24.448us +Self CPU time total: 2.094ms +Self CUDA time total: 24.545us @@ -4377,29 +4377,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 360.702us 1391.60% 360.702us 360.702us 1 - torch_eager 7.02% 142.940us 99.74% 2.030ms 2.030ms 0.000us 0.00% 28.128us 28.128us 1 - aten::to 0.30% 6.030us 85.23% 1.734ms 289.050us 0.000us 0.00% 15.136us 2.523us 6 - aten::_to_copy 1.18% 23.913us 84.93% 1.728ms 288.045us 0.000us 0.00% 15.136us 
2.523us 6 - aten::copy_ 2.60% 52.858us 82.24% 1.673ms 278.911us 12.928us 49.88% 15.136us 2.523us 6 - aten::conv1d 0.29% 5.931us 6.05% 123.062us 41.021us 0.000us 0.00% 12.992us 4.331us 3 - aten::convolution 0.49% 10.049us 5.76% 117.131us 39.044us 0.000us 0.00% 12.992us 4.331us 3 - aten::_convolution 1.15% 23.381us 5.26% 107.082us 35.694us 0.000us 0.00% 12.992us 4.331us 3 - aten::_conv_depthwise2d 1.11% 22.652us 3.33% 67.801us 22.600us 12.992us 50.12% 12.992us 4.331us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 12.992us 50.12% 12.992us 4.331us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 25.43% 6.592us 2.197us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.44% 6.336us 2.112us 3 - Activity Buffer Request 70.88% 1.442ms 70.88% 1.442ms 1.442ms 2.208us 8.52% 2.208us 2.208us 1 - aten::empty_strided 1.52% 30.891us 1.52% 30.891us 5.148us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.00% 203.394us 10.00% 203.394us 22.599us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.92% 18.741us 1.20% 24.361us 2.707us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.46% 9.330us 0.46% 9.330us 0.622us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.51% 10.450us 0.51% 10.450us 3.483us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 9.490us 0.47% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.29% 5.900us 0.36% 7.380us 2.460us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 351.133us 1341.43% 351.133us 351.133us 1 + torch_eager 7.55% 157.812us 99.73% 2.084ms 2.084ms 0.000us 0.00% 28.416us 28.416us 1 + aten::to 0.31% 6.571us 84.80% 1.772ms 295.318us 0.000us 0.00% 15.264us 2.544us 6 + aten::_to_copy 1.22% 25.450us 84.49% 1.765ms 294.223us 0.000us 0.00% 15.264us 2.544us 6 + aten::copy_ 2.31% 48.301us 81.82% 1.710ms 284.947us 13.024us 49.76% 15.264us 
2.544us 6 + aten::conv1d 0.32% 6.640us 5.96% 124.543us 41.514us 0.000us 0.00% 13.152us 4.384us 3 + aten::convolution 0.50% 10.360us 5.64% 117.903us 39.301us 0.000us 0.00% 13.152us 4.384us 3 + aten::_convolution 1.16% 24.330us 5.15% 107.543us 35.848us 0.000us 0.00% 13.152us 4.384us 3 + aten::_conv_depthwise2d 1.06% 22.241us 3.14% 65.623us 21.874us 13.152us 50.24% 13.152us 4.384us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.152us 50.24% 13.152us 4.384us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 25.43% 6.656us 2.219us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.33% 6.368us 2.123us 3 + Activity Buffer Request 70.10% 1.465ms 70.10% 1.465ms 1.465ms 2.240us 8.56% 2.240us 2.240us 1 + aten::empty_strided 1.45% 30.202us 1.45% 30.202us 5.034us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.51% 219.677us 10.51% 219.677us 24.409us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.90% 18.881us 1.17% 24.421us 2.713us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.46% 9.580us 0.46% 9.580us 0.639us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.55% 11.471us 0.55% 11.471us 3.824us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.43% 8.890us 0.43% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.33% 6.950us 0.40% 8.400us 2.800us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.035ms -Self CUDA time total: 25.920us +Self CPU time total: 2.089ms +Self CUDA time total: 26.176us @@ -4409,29 +4409,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 369.628us 962.57% 369.628us 369.628us 1 - torch_eager 7.12% 161.009us 99.76% 2.255ms 2.255ms 0.000us 0.00% 40.960us 40.960us 1 - aten::conv1d 0.32% 7.222us 5.82% 131.613us 43.871us 0.000us 0.00% 22.528us 7.509us 3 - aten::convolution 0.54% 12.229us 5.50% 124.391us 41.464us 0.000us 0.00% 22.528us 7.509us 3 - aten::_convolution 1.15% 26.031us 4.96% 112.162us 37.387us 0.000us 0.00% 22.528us 7.509us 3 - aten::_conv_depthwise2d 1.09% 24.630us 3.00% 67.820us 22.607us 22.528us 58.67% 22.528us 7.509us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.528us 58.67% 22.528us 7.509us 3 - aten::to 0.34% 7.671us 85.42% 1.931ms 321.787us 0.000us 0.00% 18.432us 3.072us 6 - aten::_to_copy 1.41% 31.890us 85.08% 1.923ms 320.509us 0.000us 0.00% 18.432us 3.072us 6 - aten::copy_ 2.64% 59.711us 82.13% 1.856ms 309.384us 15.872us 41.33% 18.432us 3.072us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.544us 22.25% 8.544us 2.848us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 19.08% 7.328us 2.443us 3 - Activity Buffer Request 64.20% 1.451ms 64.20% 1.451ms 1.451ms 2.560us 6.67% 2.560us 2.560us 1 - aten::empty_strided 1.54% 34.861us 1.54% 34.861us 5.810us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 16.32% 368.786us 16.32% 368.786us 40.976us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.93% 20.991us 1.15% 26.100us 2.900us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.41% 9.319us 0.41% 9.319us 0.621us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.44% 9.850us 0.44% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.44% 9.970us 0.44% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.31% 7.041us 0.38% 8.701us 2.900us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.627us 908.24% 349.627us 349.627us 1 + torch_eager 7.45% 152.992us 99.76% 2.049ms 2.049ms 0.000us 0.00% 41.086us 41.086us 1 + aten::conv1d 0.32% 6.640us 6.06% 124.413us 41.471us 0.000us 0.00% 22.561us 7.520us 3 + aten::convolution 0.50% 10.370us 5.73% 117.773us 39.258us 0.000us 0.00% 22.561us 7.520us 3 + aten::_convolution 1.14% 23.411us 5.23% 107.403us 35.801us 0.000us 0.00% 22.561us 7.520us 3 + aten::_conv_depthwise2d 1.15% 23.650us 3.29% 67.532us 22.511us 22.561us 58.61% 22.561us 7.520us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.561us 58.61% 22.561us 7.520us 3 + aten::to 0.33% 6.780us 84.82% 1.743ms 290.446us 0.000us 0.00% 18.525us 3.087us 6 + aten::_to_copy 1.29% 26.502us 84.49% 1.736ms 289.316us 0.000us 0.00% 18.525us 3.087us 6 + aten::copy_ 2.40% 49.251us 81.74% 1.679ms 279.869us 15.934us 41.39% 18.525us 3.087us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.543us 22.19% 8.543us 2.848us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.391us 19.20% 7.391us 2.464us 3 + Activity Buffer Request 69.84% 1.435ms 69.84% 1.435ms 1.435ms 2.591us 6.73% 2.591us 2.591us 1 + aten::empty_strided 1.47% 30.182us 1.47% 30.182us 5.030us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.64% 218.664us 10.64% 218.664us 24.296us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.89% 18.281us 1.17% 24.011us 2.668us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.47% 9.739us 0.47% 9.739us 0.649us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.53% 10.991us 0.53% 10.991us 3.664us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.46% 9.421us 0.46% 9.421us 3.140us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.29% 5.970us 0.36% 7.320us 2.440us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.260ms -Self CUDA time total: 38.400us +Self CPU time total: 2.054ms +Self CUDA time total: 38.495us @@ -4441,29 +4441,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 343.007us 838.09% 343.007us 343.007us 1 - torch_eager 6.47% 141.163us 99.73% 2.175ms 2.175ms 0.000us 0.00% 43.487us 43.487us 1 - aten::conv1d 0.27% 5.870us 5.52% 120.313us 40.104us 0.000us 0.00% 25.376us 8.459us 3 - aten::convolution 0.46% 10.120us 5.25% 114.443us 38.148us 0.000us 0.00% 
25.376us 8.459us 3 - aten::_convolution 1.12% 24.490us 4.78% 104.323us 34.774us 0.000us 0.00% 25.376us 8.459us 3 - aten::_conv_depthwise2d 1.00% 21.702us 2.89% 62.963us 20.988us 25.376us 62.00% 25.376us 8.459us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.376us 62.00% 25.376us 8.459us 3 - aten::to 0.28% 6.129us 86.46% 1.885ms 314.232us 0.000us 0.00% 18.111us 3.018us 6 - aten::_to_copy 1.13% 24.640us 86.18% 1.879ms 313.211us 0.000us 0.00% 18.111us 3.018us 6 - aten::copy_ 2.51% 54.672us 83.58% 1.823ms 303.754us 15.551us 38.00% 18.111us 3.018us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.224us 20.09% 8.224us 2.741us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 17.90% 7.327us 2.442us 3 - Activity Buffer Request 66.59% 1.452ms 66.59% 1.452ms 1.452ms 2.560us 6.26% 2.560us 2.560us 1 - aten::empty_strided 1.47% 32.100us 1.47% 32.100us 5.350us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 15.50% 338.007us 15.50% 338.007us 37.556us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.84% 18.320us 1.10% 24.070us 2.674us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.43% 9.420us 0.43% 9.420us 0.628us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.46% 10.080us 0.46% 10.080us 3.360us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.42% 9.080us 0.42% 9.080us 3.027us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 5.960us 0.34% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 345.054us 837.81% 345.054us 345.054us 1 + torch_eager 7.39% 151.695us 99.75% 2.049ms 2.049ms 0.000us 0.00% 43.810us 43.810us 1 + aten::conv1d 0.32% 6.620us 6.03% 123.883us 41.294us 0.000us 0.00% 25.375us 8.458us 3 + aten::convolution 0.50% 10.320us 5.71% 117.263us 39.088us 0.000us 0.00% 25.375us 8.458us 3 + aten::_convolution 1.20% 24.592us 5.21% 106.943us 35.648us 0.000us 0.00% 
25.375us 8.458us 3 + aten::_conv_depthwise2d 1.13% 23.150us 3.19% 65.451us 21.817us 25.375us 61.61% 25.375us 8.458us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.375us 61.61% 25.375us 8.458us 3 + aten::to 0.31% 6.440us 84.93% 1.744ms 290.716us 0.000us 0.00% 18.435us 3.072us 6 + aten::_to_copy 1.24% 25.501us 84.61% 1.738ms 289.642us 0.000us 0.00% 18.435us 3.072us 6 + aten::copy_ 2.41% 49.431us 81.91% 1.682ms 280.380us 15.810us 38.39% 18.435us 3.072us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.386us 20.36% 8.386us 2.795us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 18.03% 7.424us 2.475us 3 + Activity Buffer Request 70.32% 1.444ms 70.32% 1.444ms 1.444ms 2.625us 6.37% 2.625us 2.625us 1 + aten::empty_strided 1.46% 30.070us 1.46% 30.070us 5.012us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.28% 211.144us 10.28% 211.144us 23.460us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.92% 18.949us 1.19% 24.411us 2.712us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.45% 9.313us 0.45% 9.313us 0.621us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.52% 10.601us 0.52% 10.601us 3.534us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 9.110us 0.44% 9.110us 3.037us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.29% 5.930us 0.36% 7.410us 2.470us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.181ms -Self CUDA time total: 40.927us +Self CPU time total: 2.054ms +Self CUDA time total: 41.185us @@ -4473,29 +4473,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 367.004us 357.73% 367.004us 367.004us 1 - torch_eager 6.17% 126.763us 99.73% 2.049ms 2.049ms 0.000us 0.00% 108.512us 108.512us 1 - aten::conv1d 0.28% 5.761us 5.81% 119.372us 39.791us 0.000us 0.00% 70.432us 23.477us 3 - aten::convolution 0.48% 9.820us 5.53% 113.611us 37.870us 0.000us 0.00% 70.432us 23.477us 3 - aten::_convolution 1.11% 22.788us 5.05% 103.791us 34.597us 0.000us 0.00% 70.432us 23.477us 3 - aten::_conv_depthwise2d 1.12% 22.910us 3.14% 64.601us 21.534us 70.432us 68.65% 70.432us 23.477us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.432us 68.65% 70.432us 23.477us 3 - aten::to 0.30% 6.130us 86.37% 1.774ms 295.680us 0.000us 0.00% 38.080us 6.347us 6 - aten::_to_copy 2.18% 44.819us 86.07% 1.768ms 294.658us 0.000us 0.00% 38.080us 6.347us 6 - aten::copy_ 2.56% 52.622us 82.32% 1.691ms 281.815us 32.160us 31.35% 38.080us 6.347us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.504us 17.06% 17.504us 5.835us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 14.656us 14.29% 14.656us 4.885us 3 - Activity Buffer Request 69.77% 1.433ms 69.77% 1.433ms 1.433ms 5.920us 5.77% 5.920us 5.920us 1 - aten::empty_strided 1.57% 32.241us 1.57% 32.241us 5.373us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 11.08% 227.645us 11.08% 227.645us 25.294us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.87% 17.849us 1.12% 23.070us 2.563us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.44% 9.030us 0.44% 9.030us 0.602us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.49% 10.050us 0.49% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.44% 9.040us 0.44% 9.040us 3.013us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.30% 6.163us 0.38% 7.782us 2.594us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.348us 338.39% 348.348us 348.348us 1 + torch_eager 7.21% 148.863us 99.73% 2.059ms 2.059ms 0.000us 0.00% 108.926us 108.926us 1 + aten::conv1d 0.31% 6.430us 5.95% 122.893us 40.964us 0.000us 0.00% 70.592us 23.531us 3 + aten::convolution 0.50% 10.290us 5.64% 116.463us 38.821us 0.000us 0.00% 70.592us 23.531us 3 + aten::_convolution 1.17% 24.211us 5.14% 106.173us 35.391us 0.000us 0.00% 70.592us 23.531us 3 + aten::_conv_depthwise2d 1.12% 23.052us 3.16% 65.282us 21.761us 70.592us 68.57% 70.592us 23.531us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.592us 68.57% 70.592us 23.531us 3 + aten::to 0.31% 6.372us 85.15% 1.758ms 292.949us 0.000us 0.00% 38.334us 6.389us 6 + aten::_to_copy 1.20% 24.680us 84.84% 1.751ms 291.887us 0.000us 0.00% 38.334us 6.389us 6 + aten::copy_ 2.47% 51.072us 82.20% 1.697ms 282.787us 32.350us 31.43% 38.334us 6.389us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.695us 17.19% 17.695us 5.898us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 14.655us 14.24% 14.655us 4.885us 3 + Activity Buffer Request 70.59% 1.457ms 70.59% 1.457ms 1.457ms 5.984us 5.81% 5.984us 5.984us 1 + aten::empty_strided 1.45% 29.921us 1.45% 29.921us 4.987us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.23% 211.264us 10.23% 211.264us 23.474us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.89% 18.462us 1.17% 24.111us 2.679us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.47% 9.709us 0.47% 9.709us 0.647us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.47% 9.780us 0.47% 9.780us 3.260us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.47% 9.740us 0.47% 9.740us 3.247us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.28% 5.880us 0.35% 7.260us 2.420us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.054ms -Self CUDA time total: 102.592us +Self CPU time total: 2.064ms +Self CUDA time total: 102.942us @@ -4505,29 +4505,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.959us 299.49% 336.959us 336.959us 1 - torch_eager 6.25% 125.522us 99.75% 2.004ms 2.004ms 0.000us 0.00% 118.493us 118.493us 1 - aten::conv1d 0.38% 7.700us 5.98% 120.223us 40.074us 0.000us 0.00% 80.479us 26.826us 3 - aten::convolution 0.49% 9.780us 5.60% 112.523us 37.508us 0.000us 0.00% 
80.479us 26.826us 3 - aten::_convolution 1.13% 22.669us 5.11% 102.743us 34.248us 0.000us 0.00% 80.479us 26.826us 3 - aten::_conv_depthwise2d 1.12% 22.452us 3.19% 64.073us 21.358us 80.479us 71.53% 80.479us 26.826us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.479us 71.53% 80.479us 26.826us 3 - aten::to 0.29% 5.910us 86.14% 1.731ms 288.442us 0.000us 0.00% 38.014us 6.336us 6 - aten::_to_copy 1.19% 24.001us 85.85% 1.725ms 287.457us 0.000us 0.00% 38.014us 6.336us 6 - aten::copy_ 2.56% 51.481us 83.17% 1.671ms 278.473us 32.031us 28.47% 38.014us 6.336us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.567us 15.61% 17.567us 5.856us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 12.86% 14.464us 4.821us 3 - Activity Buffer Request 71.72% 1.441ms 71.72% 1.441ms 1.441ms 5.983us 5.32% 5.983us 5.983us 1 - aten::empty_strided 1.49% 29.901us 1.49% 29.901us 4.983us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.00% 200.814us 10.00% 200.814us 22.313us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.89% 17.861us 1.15% 23.111us 2.568us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 8.970us 0.45% 8.970us 0.598us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.50% 10.050us 0.50% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.46% 9.169us 0.46% 9.169us 3.056us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.30% 6.030us 0.38% 7.560us 2.520us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 344.181us 304.53% 344.181us 344.181us 1 + torch_eager 14.98% 124.863us 99.35% 828.302us 828.302us 0.000us 0.00% 119.036us 119.036us 1 + aten::conv1d 0.70% 5.870us 14.55% 121.343us 40.448us 0.000us 0.00% 80.669us 26.890us 3 + aten::convolution 1.17% 9.720us 13.85% 115.473us 38.491us 0.000us 0.00% 80.669us 26.890us 3 + aten::_convolution 2.96% 24.691us 12.68% 105.753us 
35.251us 0.000us 0.00% 80.669us 26.890us 3 + aten::_conv_depthwise2d 2.65% 22.121us 7.65% 63.762us 21.254us 80.669us 71.38% 80.669us 26.890us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.669us 71.38% 80.669us 26.890us 3 + aten::to 0.77% 6.429us 66.53% 554.705us 92.451us 0.000us 0.00% 38.367us 6.394us 6 + aten::_to_copy 3.01% 25.101us 65.76% 548.276us 91.379us 0.000us 0.00% 38.367us 6.394us 6 + aten::copy_ 6.16% 51.352us 59.05% 492.343us 82.057us 32.351us 28.62% 38.367us 6.394us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.696us 15.66% 17.696us 5.899us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.655us 12.97% 14.655us 4.885us 3 + Activity Buffer Request 28.81% 240.197us 28.81% 240.197us 240.197us 6.016us 5.32% 6.016us 6.016us 1 + aten::empty_strided 3.70% 30.832us 3.70% 30.832us 5.139us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 26.65% 222.174us 26.65% 222.174us 24.686us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.09% 17.401us 2.70% 22.541us 2.505us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.05% 8.790us 1.05% 8.790us 0.586us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.34% 11.151us 1.34% 11.151us 3.717us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.09% 9.110us 1.09% 9.110us 3.037us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.89% 7.450us 1.05% 8.790us 2.930us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.009ms -Self CUDA time total: 112.510us +Self CPU time total: 833.752us +Self CUDA time total: 113.020us @@ -4537,29 +4537,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 5.98% 122.945us 97.76% 2.011ms 2.011ms 0.000us 0.00% 433.437us 433.437us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 423.709us 107.83% 423.709us 423.709us 1 - aten::conv1d 0.28% 5.760us 5.73% 117.851us 39.284us 0.000us 0.00% 250.941us 83.647us 3 - aten::convolution 0.48% 9.830us 5.45% 112.091us 37.364us 0.000us 0.00% 250.941us 83.647us 3 - aten::_convolution 1.12% 23.111us 4.97% 102.261us 34.087us 0.000us 0.00% 250.941us 83.647us 3 - aten::_conv_depthwise2d 1.03% 21.200us 3.03% 62.360us 20.787us 250.941us 63.86% 250.941us 83.647us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 250.941us 63.86% 250.941us 83.647us 3 - aten::to 0.28% 5.851us 84.70% 1.742ms 290.313us 0.000us 0.00% 182.496us 30.416us 6 - aten::_to_copy 1.16% 23.919us 84.41% 1.736ms 289.338us 0.000us 0.00% 182.496us 30.416us 6 - aten::copy_ 2.53% 51.981us 81.78% 1.682ms 280.333us 142.016us 36.14% 182.496us 30.416us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 101.952us 25.94% 101.952us 33.984us 3 - Activity Buffer Request 70.64% 1.453ms 70.64% 1.453ms 1.453ms 40.480us 10.30% 40.480us 40.480us 1 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 40.064us 10.20% 40.064us 13.355us 3 - aten::empty_strided 1.46% 30.112us 1.46% 30.112us 5.019us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.67% 198.853us 9.67% 198.853us 22.095us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.91% 18.669us 1.18% 24.270us 2.697us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.44% 9.151us 0.44% 9.151us 0.610us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.48% 9.870us 0.48% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 9.710us 0.47% 9.710us 3.237us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.29% 5.960us 0.36% 7.350us 2.450us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 14.21% 122.455us 95.83% 825.681us 825.681us 0.000us 0.00% 433.339us 433.339us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 419.771us 106.59% 419.771us 419.771us 1 + aten::conv1d 0.75% 6.429us 14.10% 121.522us 40.507us 0.000us 0.00% 251.453us 83.818us 3 + aten::convolution 1.15% 9.929us 13.36% 115.093us 38.364us 0.000us 0.00% 251.453us 83.818us 3 + aten::_convolution 2.67% 23.042us 12.21% 105.164us 35.055us 0.000us 0.00% 251.453us 83.818us 3 + aten::_conv_depthwise2d 2.60% 22.440us 7.52% 64.810us 21.603us 251.453us 63.85% 251.453us 83.818us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.453us 63.85% 251.453us 83.818us 3 + aten::to 0.70% 6.001us 64.14% 552.672us 92.112us 0.000us 0.00% 181.886us 30.314us 6 + aten::_to_copy 2.73% 23.540us 63.45% 546.671us 91.112us 0.000us 0.00% 181.886us 30.314us 6 + aten::copy_ 5.94% 51.140us 57.36% 494.211us 82.368us 142.367us 36.15% 181.886us 30.314us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.367us 25.99% 102.367us 34.122us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 40.000us 10.16% 40.000us 13.333us 3 + Activity Buffer Request 29.04% 250.247us 29.04% 250.247us 250.247us 39.519us 10.03% 39.519us 39.519us 1 + aten::empty_strided 3.36% 28.920us 3.36% 28.920us 4.820us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.89% 214.494us 24.89% 214.494us 23.833us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.98% 17.062us 2.59% 22.273us 2.475us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.09% 9.391us 1.09% 9.391us 0.626us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.24% 10.660us 1.24% 10.660us 3.553us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.17% 10.040us 1.17% 10.040us 3.347us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.86% 7.370us 1.02% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.057ms -Self CUDA time total: 392.957us +Self CPU time total: 861.602us +Self CUDA time total: 393.820us @@ -4569,29 +4569,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 5.86% 122.119us 95.18% 1.984ms 1.984ms 0.000us 0.00% 485.373us 485.373us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 475.549us 106.61% 475.549us 475.549us 1 - aten::conv1d 0.29% 6.020us 5.58% 116.291us 38.764us 0.000us 0.00% 298.429us 99.476us 3 - aten::convolution 0.46% 9.580us 5.29% 110.271us 
36.757us 0.000us 0.00% 298.429us 99.476us 3 - aten::_convolution 1.07% 22.391us 4.83% 100.691us 33.564us 0.000us 0.00% 298.429us 99.476us 3 - aten::_conv_depthwise2d 1.02% 21.160us 3.01% 62.730us 20.910us 298.429us 66.91% 298.429us 99.476us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.429us 66.91% 298.429us 99.476us 3 - aten::to 0.28% 5.929us 82.40% 1.718ms 286.300us 0.000us 0.00% 186.944us 31.157us 6 - aten::_to_copy 1.13% 23.472us 82.12% 1.712ms 285.312us 0.000us 0.00% 186.944us 31.157us 6 - aten::copy_ 2.45% 51.061us 79.57% 1.659ms 276.443us 147.616us 33.09% 186.944us 31.157us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 107.712us 24.15% 107.712us 35.904us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.904us 8.95% 39.904us 13.301us 3 - Activity Buffer Request 68.65% 1.431ms 68.65% 1.431ms 1.431ms 39.328us 8.82% 39.328us 39.328us 1 - aten::empty_strided 1.43% 29.742us 1.43% 29.742us 4.957us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.54% 198.903us 9.54% 198.903us 22.100us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.85% 17.731us 1.11% 23.210us 2.579us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.44% 9.210us 0.44% 9.210us 0.614us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.47% 9.850us 0.47% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.45% 9.320us 0.45% 9.320us 3.107us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.850us 0.35% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 15.32% 134.312us 91.67% 803.971us 803.971us 0.000us 0.00% 487.924us 487.924us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 476.501us 106.34% 476.501us 476.501us 1 + aten::conv1d 0.67% 5.860us 13.82% 121.173us 40.391us 0.000us 0.00% 299.161us 99.720us 3 + aten::convolution 1.17% 10.220us 13.15% 115.313us 38.438us 0.000us 0.00% 299.161us 99.720us 3 + 
aten::_convolution 2.67% 23.450us 11.98% 105.093us 35.031us 0.000us 0.00% 299.161us 99.720us 3 + aten::_conv_depthwise2d 2.56% 22.451us 7.48% 65.623us 21.874us 299.161us 66.76% 299.161us 99.720us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 299.161us 66.76% 299.161us 99.720us 3 + aten::to 0.69% 6.051us 59.17% 518.906us 86.484us 0.000us 0.00% 188.763us 31.460us 6 + aten::_to_copy 2.71% 23.771us 58.48% 512.855us 85.476us 0.000us 0.00% 188.763us 31.460us 6 + aten::copy_ 5.69% 49.880us 52.31% 458.742us 76.457us 148.924us 33.24% 188.763us 31.460us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.861us 24.29% 108.861us 36.287us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.063us 8.94% 40.063us 13.354us 3 + Activity Buffer Request 25.01% 219.366us 25.01% 219.366us 219.366us 39.839us 8.89% 39.839us 39.839us 1 + aten::empty_strided 3.46% 30.342us 3.46% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.34% 213.439us 24.34% 213.439us 23.715us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.98% 17.400us 2.59% 22.720us 2.524us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.09% 9.540us 1.09% 9.540us 0.636us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.14% 10.010us 1.14% 10.010us 3.337us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.05% 9.219us 1.05% 9.219us 3.073us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.66% 5.750us 0.82% 7.210us 2.403us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.085ms -Self CUDA time total: 446.045us +Self CPU time total: 876.983us +Self CUDA time total: 448.085us @@ -4601,29 +4601,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2 
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.833us 1729.88% 323.833us 323.833us 1 - torch_eager 14.51% 116.191us 99.37% 795.884us 795.884us 0.000us 0.00% 20.608us 20.608us 1 - aten::to 0.75% 6.009us 67.15% 537.870us 89.645us 0.000us 0.00% 13.376us 2.229us 6 - aten::_to_copy 2.93% 23.471us 66.40% 531.861us 88.644us 0.000us 0.00% 13.376us 2.229us 6 - aten::copy_ 6.32% 50.599us 59.65% 477.769us 79.628us 11.488us 61.37% 13.376us 2.229us 6 - aten::conv1d 0.81% 6.510us 14.38% 115.173us 38.391us 0.000us 0.00% 7.232us 2.411us 3 - aten::convolution 1.28% 10.221us 13.57% 108.663us 36.221us 0.000us 0.00% 7.232us 2.411us 3 - aten::_convolution 2.73% 21.890us 12.29% 98.442us 32.814us 0.000us 0.00% 7.232us 2.411us 3 - aten::_conv_depthwise2d 2.76% 22.080us 7.70% 61.700us 20.567us 7.232us 38.63% 7.232us 2.411us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 38.63% 7.232us 2.411us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 31.45% 5.888us 1.963us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 29.91% 5.600us 1.867us 3 - Activity Buffer Request 31.20% 249.924us 31.20% 249.924us 249.924us 1.888us 10.09% 1.888us 1.888us 1 - aten::empty_strided 3.82% 30.621us 3.82% 30.621us 5.103us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.75% 198.236us 24.75% 198.236us 22.026us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.09% 16.762us 2.71% 21.692us 2.410us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.04% 8.330us 1.04% 8.330us 0.555us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.15% 9.220us 1.15% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.17% 9.410us 1.17% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.72% 5.800us 0.89% 7.160us 2.387us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.392us 1804.85% 338.392us 338.392us 1 + torch_eager 18.33% 161.236us 99.35% 873.703us 873.703us 0.000us 0.00% 20.637us 20.637us 1 + aten::to 0.69% 6.070us 63.71% 560.224us 93.371us 0.000us 0.00% 13.406us 2.234us 6 + aten::_to_copy 2.78% 24.471us 63.02% 554.154us 92.359us 0.000us 0.00% 13.406us 2.234us 6 + aten::copy_ 5.94% 52.212us 56.85% 499.953us 83.325us 11.518us 61.43% 13.406us 2.234us 6 + aten::conv1d 0.64% 5.659us 14.02% 123.282us 41.094us 0.000us 0.00% 7.231us 2.410us 3 + aten::convolution 1.14% 9.999us 13.38% 117.623us 39.208us 0.000us 0.00% 7.231us 2.410us 3 + aten::_convolution 2.72% 23.952us 12.24% 107.624us 35.875us 0.000us 0.00% 7.231us 2.410us 3 + aten::_conv_depthwise2d 2.67% 23.519us 7.63% 67.130us 22.377us 7.231us 38.57% 7.231us 2.410us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.231us 38.57% 7.231us 2.410us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.854us 31.22% 5.854us 1.951us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 30.21% 5.664us 1.888us 3 + Activity Buffer Request 29.52% 259.596us 29.52% 259.596us 259.596us 1.888us 10.07% 1.888us 1.888us 1 + aten::empty_strided 3.38% 29.730us 3.38% 29.730us 4.955us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 23.99% 210.946us 23.99% 210.946us 23.438us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.07% 18.190us 2.71% 23.871us 2.652us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.11% 9.761us 1.11% 9.761us 0.651us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.24% 10.890us 1.24% 10.890us 3.630us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.13% 9.920us 1.13% 9.920us 3.307us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.68% 5.972us 0.85% 7.452us 2.484us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 800.944us -Self CUDA time total: 18.720us +Self CPU time total: 879.393us +Self CUDA time total: 18.749us @@ -4633,29 +4633,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.666us 1676.91% 324.666us 324.666us 1 - torch_eager 15.17% 119.302us 99.37% 781.483us 781.483us 0.000us 0.00% 21.249us 21.249us 1 - aten::to 0.72% 5.648us 65.85% 517.928us 86.321us 0.000us 0.00% 13.345us 2.224us 6 - aten::_to_copy 2.87% 22.611us 65.14% 512.280us 85.380us 0.000us 
0.00% 13.345us 2.224us 6 - aten::copy_ 6.22% 48.900us 58.49% 460.037us 76.673us 11.457us 59.18% 13.345us 2.224us 6 - aten::conv1d 0.87% 6.869us 14.99% 117.911us 39.304us 0.000us 0.00% 7.904us 2.635us 3 - aten::convolution 1.27% 10.002us 14.12% 111.042us 37.014us 0.000us 0.00% 7.904us 2.635us 3 - aten::_convolution 2.89% 22.710us 12.85% 101.040us 33.680us 0.000us 0.00% 7.904us 2.635us 3 - aten::_conv_depthwise2d 2.75% 21.590us 8.00% 62.920us 20.973us 7.904us 40.82% 7.904us 2.635us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.82% 7.904us 2.635us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.825us 30.09% 5.825us 1.942us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.09% 5.632us 1.877us 3 - Activity Buffer Request 30.25% 237.875us 30.25% 237.875us 237.875us 1.888us 9.75% 1.888us 1.888us 1 - aten::empty_strided 3.77% 29.632us 3.77% 29.632us 4.939us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.87% 195.612us 24.87% 195.612us 21.735us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.14% 16.821us 2.78% 21.881us 2.431us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.08% 8.481us 1.08% 8.481us 0.565us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.22% 9.600us 1.22% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.19% 9.380us 1.19% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.75% 5.869us 0.93% 7.280us 2.427us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.934us 1741.87% 338.934us 338.934us 1 + torch_eager 16.71% 145.362us 99.29% 863.592us 863.592us 0.000us 0.00% 21.314us 21.314us 1 + aten::to 0.71% 6.200us 65.36% 568.524us 94.754us 0.000us 0.00% 13.282us 2.214us 6 + aten::_to_copy 2.85% 24.831us 64.65% 562.324us 93.721us 0.000us 0.00% 13.282us 2.214us 6 + aten::copy_ 5.81% 50.550us 58.39% 507.883us 84.647us 
11.426us 58.72% 13.282us 2.214us 6 + aten::conv1d 0.78% 6.753us 14.06% 122.315us 40.772us 0.000us 0.00% 8.032us 2.677us 3 + aten::convolution 1.19% 10.380us 13.29% 115.562us 38.521us 0.000us 0.00% 8.032us 2.677us 3 + aten::_convolution 2.63% 22.841us 12.09% 105.182us 35.061us 0.000us 0.00% 8.032us 2.677us 3 + aten::_conv_depthwise2d 2.65% 23.042us 7.65% 66.512us 22.171us 8.032us 41.28% 8.032us 2.677us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 41.28% 8.032us 2.677us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.825us 29.94% 5.825us 1.942us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.601us 28.79% 5.601us 1.867us 3 + Activity Buffer Request 30.62% 266.307us 30.62% 266.307us 266.307us 1.856us 9.54% 1.856us 1.856us 1 + aten::empty_strided 3.40% 29.610us 3.40% 29.610us 4.935us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 24.61% 214.076us 24.61% 214.076us 23.786us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.02% 17.612us 2.63% 22.841us 2.538us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.02% 8.840us 1.02% 8.840us 0.589us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.22% 10.630us 1.22% 10.630us 3.543us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.13% 9.790us 1.13% 9.790us 3.263us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.67% 5.798us 0.82% 7.109us 2.370us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 786.473us -Self CUDA time total: 19.361us +Self CPU time total: 869.783us +Self CUDA time total: 19.458us @@ -4665,29 +4665,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.865us 1704.41% 328.865us 328.865us 1 - torch_eager 14.92% 117.622us 99.37% 783.184us 783.184us 0.000us 0.00% 21.439us 21.439us 1 - aten::to 0.74% 5.810us 66.49% 524.079us 87.347us 0.000us 0.00% 14.207us 2.368us 6 - aten::_to_copy 3.01% 23.701us 65.75% 518.269us 86.378us 0.000us 0.00% 14.207us 2.368us 6 - aten::copy_ 6.49% 51.190us 58.71% 462.718us 77.120us 12.063us 62.52% 14.207us 2.368us 6 - aten::conv1d 0.75% 5.890us 14.60% 115.093us 38.364us 0.000us 0.00% 7.232us 2.411us 3 - aten::convolution 1.22% 9.630us 13.86% 109.203us 36.401us 0.000us 0.00% 7.232us 2.411us 3 - aten::_convolution 2.83% 22.270us 12.63% 99.573us 33.191us 0.000us 0.00% 7.232us 2.411us 3 - aten::_conv_depthwise2d 2.80% 22.070us 7.82% 61.673us 20.558us 7.232us 37.48% 7.232us 2.411us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 37.48% 7.232us 2.411us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 32.34% 6.240us 2.080us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.823us 30.18% 5.823us 1.941us 3 - Activity Buffer Request 29.70% 234.095us 29.70% 234.095us 234.095us 2.144us 11.11% 2.144us 2.144us 1 - aten::empty_strided 4.04% 31.850us 4.04% 31.850us 5.308us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 25.25% 199.015us 25.25% 199.015us 22.113us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.15% 16.950us 2.78% 21.920us 2.436us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.05% 8.280us 1.05% 8.280us 0.552us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.22% 9.600us 1.22% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.07% 8.421us 1.07% 8.421us 2.807us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.76% 5.960us 0.92% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.862us 1751.78% 340.862us 340.862us 1 + torch_eager 8.44% 173.073us 99.74% 2.045ms 2.045ms 0.000us 0.00% 21.635us 21.635us 1 + aten::to 0.33% 6.670us 84.06% 1.723ms 287.196us 0.000us 0.00% 14.307us 2.385us 6 + aten::_to_copy 1.21% 24.883us 83.74% 1.717ms 286.084us 0.000us 0.00% 14.307us 2.385us 6 + aten::copy_ 2.36% 48.471us 81.06% 1.662ms 276.949us 12.130us 62.34% 14.307us 2.385us 6 + aten::conv1d 0.29% 5.970us 5.84% 119.613us 39.871us 0.000us 0.00% 7.328us 2.443us 3 + aten::convolution 0.48% 9.780us 5.54% 113.643us 37.881us 0.000us 0.00% 7.328us 2.443us 3 + aten::_convolution 1.14% 23.420us 5.07% 103.863us 34.621us 0.000us 0.00% 7.328us 2.443us 3 + aten::_conv_depthwise2d 1.10% 22.512us 3.15% 64.503us 21.501us 7.328us 37.66% 7.328us 2.443us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.66% 7.328us 2.443us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.241us 32.07% 6.241us 2.080us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 30.27% 5.889us 1.963us 3 + Activity Buffer Request 69.34% 1.421ms 69.34% 1.421ms 1.421ms 2.177us 11.19% 2.177us 2.177us 1 + aten::empty_strided 1.46% 29.930us 1.46% 29.930us 4.988us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.50% 215.256us 10.50% 215.256us 23.917us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.86% 17.669us 1.13% 23.180us 2.576us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.47% 9.581us 0.47% 9.581us 0.639us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.48% 9.759us 0.48% 9.759us 3.253us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.43% 8.742us 0.43% 8.742us 2.914us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.28% 5.760us 0.35% 7.110us 2.370us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 788.184us -Self CUDA time total: 19.295us +Self CPU time total: 2.050ms +Self CUDA time total: 19.458us @@ -4697,29 +4697,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.745us 1665.90% 334.745us 334.745us 1 - torch_eager 14.26% 118.712us 99.40% 827.395us 827.395us 0.000us 0.00% 22.270us 22.270us 1 - aten::to 0.70% 5.840us 67.41% 561.119us 93.520us 0.000us 0.00% 14.335us 2.389us 6 - aten::_to_copy 2.86% 23.780us 66.71% 555.279us 92.546us 0.000us 0.00% 
14.335us 2.389us 6 - aten::copy_ 6.22% 51.741us 60.26% 501.588us 83.598us 12.159us 60.51% 14.335us 2.389us 6 - aten::conv1d 0.81% 6.751us 14.52% 120.873us 40.291us 0.000us 0.00% 7.935us 2.645us 3 - aten::convolution 1.20% 9.989us 13.71% 114.122us 38.041us 0.000us 0.00% 7.935us 2.645us 3 - aten::_convolution 2.78% 23.181us 12.51% 104.133us 34.711us 0.000us 0.00% 7.935us 2.645us 3 - aten::_conv_depthwise2d 2.64% 22.000us 7.72% 64.243us 21.414us 7.935us 39.49% 7.935us 2.645us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 39.49% 7.935us 2.645us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.239us 31.05% 6.239us 2.080us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.46% 5.920us 1.973us 3 - Activity Buffer Request 32.59% 271.245us 32.59% 271.245us 271.245us 2.176us 10.83% 2.176us 2.176us 1 - aten::empty_strided 3.59% 29.911us 3.59% 29.911us 4.985us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.22% 201.614us 24.22% 201.614us 22.402us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.06% 17.131us 2.68% 22.291us 2.477us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.07% 8.900us 1.07% 8.900us 0.593us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.16% 9.640us 1.16% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.15% 9.591us 1.15% 9.591us 3.197us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.79% 6.549us 0.97% 8.109us 2.703us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 367.067us 1820.95% 367.067us 367.067us 1 + torch_eager 17.50% 145.595us 99.30% 826.111us 826.111us 0.000us 0.00% 22.366us 22.366us 1 + aten::to 0.75% 6.199us 63.72% 530.082us 88.347us 0.000us 0.00% 14.431us 2.405us 6 + aten::_to_copy 2.95% 24.573us 62.97% 523.883us 87.314us 0.000us 0.00% 14.431us 2.405us 6 + aten::copy_ 6.31% 52.521us 56.15% 467.170us 77.862us 12.223us 
60.64% 14.431us 2.405us 6 + aten::conv1d 0.69% 5.760us 14.59% 121.354us 40.451us 0.000us 0.00% 7.935us 2.645us 3 + aten::convolution 1.24% 10.281us 13.89% 115.594us 38.531us 0.000us 0.00% 7.935us 2.645us 3 + aten::_convolution 2.68% 22.269us 12.66% 105.313us 35.104us 0.000us 0.00% 7.935us 2.645us 3 + aten::_conv_depthwise2d 2.73% 22.701us 8.02% 66.711us 22.237us 7.935us 39.36% 7.935us 2.645us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 39.36% 7.935us 2.645us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 31.27% 6.304us 2.101us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 29.36% 5.919us 1.973us 3 + Activity Buffer Request 27.00% 224.665us 27.00% 224.665us 224.665us 2.208us 10.95% 2.208us 2.208us 1 + aten::empty_strided 3.86% 32.140us 3.86% 32.140us 5.357us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.71% 213.894us 25.71% 213.894us 23.766us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.05% 17.041us 2.71% 22.553us 2.506us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.14% 9.503us 1.14% 9.503us 0.634us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.31% 10.920us 1.31% 10.920us 3.640us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.10% 9.180us 1.10% 9.180us 3.060us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.81% 6.740us 0.98% 8.160us 2.720us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 832.395us -Self CUDA time total: 20.094us +Self CPU time total: 831.951us +Self CUDA time total: 20.158us @@ -4729,29 +4729,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.142us 918.64% 330.142us 330.142us 1 - torch_eager 14.68% 120.212us 99.34% 813.674us 813.674us 0.000us 0.00% 38.530us 38.530us 1 - aten::conv1d 0.79% 6.500us 14.15% 115.923us 38.641us 0.000us 0.00% 20.161us 6.720us 3 - aten::convolution 1.18% 9.650us 13.36% 109.423us 36.474us 0.000us 0.00% 20.161us 6.720us 3 - aten::_convolution 2.75% 22.509us 12.18% 99.773us 33.258us 0.000us 0.00% 20.161us 6.720us 3 - aten::_conv_depthwise2d 2.55% 20.922us 7.56% 61.883us 20.628us 20.161us 56.10% 20.161us 6.720us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.161us 56.10% 20.161us 6.720us 3 - aten::to 0.72% 5.880us 67.15% 549.969us 91.661us 0.000us 0.00% 18.369us 3.061us 6 - aten::_to_copy 2.82% 23.099us 66.43% 544.089us 90.682us 0.000us 0.00% 18.369us 3.061us 6 - aten::copy_ 6.44% 52.723us 59.97% 491.160us 81.860us 15.777us 43.90% 18.369us 3.061us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 23.51% 8.448us 2.816us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.329us 20.39% 7.329us 2.443us 3 - Activity Buffer Request 32.20% 263.764us 32.20% 263.764us 263.764us 2.592us 7.21% 2.592us 2.592us 1 - aten::empty_strided 3.64% 29.830us 3.64% 29.830us 4.972us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 24.00% 196.543us 24.00% 196.543us 21.838us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.14% 17.540us 2.77% 22.711us 2.523us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.07% 8.761us 1.07% 8.761us 0.584us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.21% 9.871us 1.21% 9.871us 3.290us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.13% 9.220us 1.13% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.68% 5.610us 0.85% 7.000us 2.333us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 363.100us 1005.93% 363.100us 363.100us 1 + torch_eager 14.77% 122.163us 99.35% 821.971us 821.971us 0.000us 0.00% 38.688us 38.688us 1 + aten::conv1d 0.72% 5.951us 17.29% 143.024us 47.675us 0.000us 0.00% 20.160us 6.720us 3 + aten::convolution 1.22% 10.110us 16.57% 137.073us 45.691us 0.000us 0.00% 20.160us 6.720us 3 + aten::_convolution 3.04% 25.151us 15.35% 126.963us 42.321us 0.000us 0.00% 20.160us 6.720us 3 + aten::_conv_depthwise2d 4.80% 39.711us 10.31% 85.271us 28.424us 20.160us 55.85% 20.160us 6.720us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.160us 55.85% 20.160us 6.720us 3 + aten::to 0.75% 6.172us 63.79% 527.804us 87.967us 0.000us 0.00% 18.528us 3.088us 6 + aten::_to_copy 2.99% 24.751us 63.05% 521.632us 86.939us 0.000us 0.00% 18.528us 3.088us 6 + aten::copy_ 6.14% 50.790us 56.45% 467.021us 77.837us 15.936us 44.15% 18.528us 3.088us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.512us 23.58% 8.512us 2.837us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 20.57% 7.424us 2.475us 3 + Activity Buffer Request 27.93% 231.066us 27.93% 231.066us 231.066us 2.592us 7.18% 2.592us 2.592us 1 + aten::empty_strided 3.61% 29.860us 3.61% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.33% 209.585us 25.33% 209.585us 23.287us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.11% 17.441us 2.75% 22.791us 2.532us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.15% 9.501us 1.15% 9.501us 0.633us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.26% 10.400us 1.26% 10.400us 3.467us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.30% 10.740us 1.30% 10.740us 3.580us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.76% 6.269us 0.93% 7.730us 2.577us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 819.054us -Self CUDA time total: 35.938us +Self CPU time total: 827.381us +Self CUDA time total: 36.096us @@ -4761,29 +4761,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.946us 872.79% 330.946us 330.946us 1 - torch_eager 6.07% 120.841us 99.75% 1.987ms 1.987ms 0.000us 0.00% 40.478us 40.478us 1 - aten::conv1d 0.33% 6.510us 5.92% 117.833us 39.278us 0.000us 0.00% 22.271us 7.424us 3 - aten::convolution 0.49% 9.850us 5.59% 111.323us 37.108us 0.000us 
0.00% 22.271us 7.424us 3 - aten::_convolution 1.11% 22.181us 5.10% 101.473us 33.824us 0.000us 0.00% 22.271us 7.424us 3 - aten::_conv_depthwise2d 1.10% 21.811us 3.17% 63.042us 21.014us 22.271us 58.73% 22.271us 7.424us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.271us 58.73% 22.271us 7.424us 3 - aten::to 0.30% 5.981us 86.38% 1.720ms 286.727us 0.000us 0.00% 18.207us 3.034us 6 - aten::_to_copy 1.18% 23.522us 86.08% 1.714ms 285.730us 0.000us 0.00% 18.207us 3.034us 6 - aten::copy_ 2.55% 50.829us 83.41% 1.661ms 276.860us 15.647us 41.27% 18.207us 3.034us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 21.94% 8.320us 2.773us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 19.32% 7.327us 2.442us 3 - Activity Buffer Request 72.02% 1.434ms 72.02% 1.434ms 1.434ms 2.560us 6.75% 2.560us 2.560us 1 - aten::empty_strided 1.49% 29.700us 1.49% 29.700us 4.950us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 9.93% 197.835us 9.93% 197.835us 21.982us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.90% 17.980us 1.17% 23.390us 2.599us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.44% 8.840us 0.44% 8.840us 0.589us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.50% 9.970us 0.50% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 9.410us 0.47% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.31% 6.110us 0.38% 7.490us 2.497us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.025us 883.88% 336.025us 336.025us 1 + torch_eager 14.70% 120.902us 99.36% 817.351us 817.351us 0.000us 0.00% 40.610us 40.610us 1 + aten::conv1d 0.71% 5.820us 14.44% 118.823us 39.608us 0.000us 0.00% 22.304us 7.435us 3 + aten::convolution 1.12% 9.190us 13.74% 113.003us 37.668us 0.000us 0.00% 22.304us 7.435us 3 + aten::_convolution 2.83% 23.270us 12.62% 103.813us 34.604us 0.000us 
0.00% 22.304us 7.435us 3 + aten::_conv_depthwise2d 2.83% 23.309us 7.79% 64.072us 21.357us 22.304us 58.67% 22.304us 7.435us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.304us 58.67% 22.304us 7.435us 3 + aten::to 0.73% 5.990us 66.75% 549.075us 91.513us 0.000us 0.00% 18.306us 3.051us 6 + aten::_to_copy 2.91% 23.953us 66.02% 543.085us 90.514us 0.000us 0.00% 18.306us 3.051us 6 + aten::copy_ 6.07% 49.902us 59.57% 490.042us 81.674us 15.713us 41.33% 18.306us 3.051us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.353us 21.97% 8.353us 2.784us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 19.36% 7.360us 2.453us 3 + Activity Buffer Request 30.85% 253.806us 30.85% 253.806us 253.806us 2.593us 6.82% 2.593us 2.593us 1 + aten::empty_strided 3.54% 29.090us 3.54% 29.090us 4.848us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.29% 208.074us 25.29% 208.074us 23.119us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.19% 18.051us 2.84% 23.371us 2.597us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.11% 9.160us 1.11% 9.160us 0.611us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.21% 9.961us 1.21% 9.961us 3.320us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.10% 9.062us 1.10% 9.062us 3.021us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.80% 6.580us 0.96% 7.920us 2.640us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.992ms -Self CUDA time total: 37.918us +Self CPU time total: 822.611us +Self CUDA time total: 38.017us @@ -4793,29 +4793,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 385.308us 602.34% 385.308us 385.308us 1 - torch_eager 14.42% 123.450us 99.41% 851.045us 851.045us 0.000us 0.00% 68.065us 68.065us 1 - aten::conv1d 0.67% 5.711us 13.49% 115.513us 38.504us 0.000us 0.00% 41.633us 13.878us 3 - aten::convolution 1.22% 10.470us 12.83% 109.802us 36.601us 0.000us 0.00% 41.633us 13.878us 3 - aten::_convolution 2.63% 22.491us 11.60% 99.332us 33.111us 0.000us 0.00% 41.633us 13.878us 3 - aten::_conv_depthwise2d 2.49% 21.351us 7.22% 61.852us 20.617us 41.633us 65.08% 41.633us 13.878us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.633us 65.08% 41.633us 13.878us 3 - aten::to 0.71% 6.120us 68.08% 582.862us 97.144us 0.000us 0.00% 26.432us 4.405us 6 - aten::_to_copy 2.87% 24.611us 67.37% 576.742us 96.124us 0.000us 0.00% 26.432us 4.405us 6 - aten::copy_ 6.21% 53.173us 60.75% 520.070us 86.678us 22.336us 34.92% 26.432us 4.405us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.936us 18.66% 11.936us 3.979us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 16.26% 10.400us 3.467us 3 - Activity Buffer Request 28.33% 242.554us 28.33% 242.554us 242.554us 4.096us 6.40% 4.096us 4.096us 1 - aten::empty_strided 3.74% 32.061us 3.74% 32.061us 5.344us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 28.79% 246.523us 28.79% 246.523us 27.391us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.02% 17.269us 2.63% 22.529us 2.503us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.08% 9.240us 1.08% 9.240us 0.616us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.11% 9.521us 1.11% 9.521us 3.174us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.03% 8.800us 1.03% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.68% 5.830us 0.84% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.486us 522.89% 335.486us 335.486us 1 + torch_eager 15.29% 123.163us 99.38% 800.491us 800.491us 0.000us 0.00% 68.256us 68.256us 1 + aten::conv1d 0.73% 5.840us 14.87% 119.763us 39.921us 0.000us 0.00% 41.760us 13.920us 3 + aten::convolution 1.21% 9.761us 14.14% 113.923us 37.974us 0.000us 0.00% 41.760us 13.920us 3 + aten::_convolution 2.84% 22.911us 12.93% 104.162us 34.721us 0.000us 0.00% 41.760us 13.920us 3 + aten::_conv_depthwise2d 2.80% 22.570us 8.02% 64.572us 21.524us 41.760us 65.09% 41.760us 13.920us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.760us 65.09% 41.760us 13.920us 3 + aten::to 0.73% 5.842us 65.67% 528.904us 88.151us 0.000us 0.00% 26.496us 4.416us 6 + aten::_to_copy 2.94% 23.712us 64.94% 523.062us 87.177us 0.000us 0.00% 26.496us 4.416us 6 + aten::copy_ 6.02% 48.492us 58.29% 469.521us 78.253us 22.400us 34.91% 26.496us 4.416us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.968us 18.65% 11.968us 3.989us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 16.26% 10.432us 3.477us 3 + Activity Buffer Request 29.33% 236.206us 29.33% 236.206us 236.206us 4.096us 6.38% 4.096us 4.096us 1 + aten::empty_strided 3.70% 29.829us 3.70% 29.829us 4.971us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.91% 208.693us 25.91% 208.693us 23.188us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.18% 17.569us 2.86% 23.069us 2.563us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.14% 9.222us 1.14% 9.222us 0.615us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.20% 9.631us 1.20% 9.631us 3.210us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.06% 8.501us 1.06% 8.501us 2.834us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.83% 6.660us 0.99% 7.990us 2.663us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 856.136us -Self CUDA time total: 63.969us +Self CPU time total: 805.451us +Self CUDA time total: 64.160us @@ -4825,29 +4825,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.859us 513.70% 357.859us 357.859us 1 - torch_eager 20.53% 180.503us 99.40% 873.955us 873.955us 0.000us 0.00% 73.695us 73.695us 1 - aten::conv1d 0.63% 5.530us 15.78% 138.703us 46.234us 0.000us 0.00% 47.359us 15.786us 3 - aten::convolution 1.12% 9.840us 15.15% 133.173us 44.391us 
0.000us 0.00% 47.359us 15.786us 3 - aten::_convolution 2.65% 23.331us 14.03% 123.333us 41.111us 0.000us 0.00% 47.359us 15.786us 3 - aten::_conv_depthwise2d 2.63% 23.161us 9.53% 83.782us 27.927us 47.359us 67.98% 47.359us 15.786us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.359us 67.98% 47.359us 15.786us 3 - aten::to 0.72% 6.308us 59.85% 526.239us 87.707us 0.000us 0.00% 26.336us 4.389us 6 - aten::_to_copy 2.80% 24.578us 59.14% 519.931us 86.655us 0.000us 0.00% 26.336us 4.389us 6 - aten::copy_ 6.12% 53.792us 52.84% 464.590us 77.432us 22.304us 32.02% 26.336us 4.389us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.840us 17.00% 11.840us 3.947us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 15.02% 10.464us 3.488us 3 - Activity Buffer Request 26.53% 233.244us 26.53% 233.244us 233.244us 4.032us 5.79% 4.032us 4.032us 1 - aten::empty_strided 3.50% 30.763us 3.50% 30.763us 5.127us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 22.92% 201.494us 22.92% 201.494us 22.388us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.03% 17.891us 2.67% 23.440us 2.604us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.06% 9.339us 1.06% 9.339us 0.623us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 2.95% 25.971us 2.95% 25.971us 8.657us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.22% 10.710us 1.22% 10.710us 3.570us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.71% 6.240us 0.88% 7.780us 2.593us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.218us 487.48% 340.218us 340.218us 1 + torch_eager 15.18% 124.853us 99.38% 817.682us 817.682us 0.000us 0.00% 73.887us 73.887us 1 + aten::conv1d 0.72% 5.910us 14.57% 119.903us 39.968us 0.000us 0.00% 47.328us 15.776us 3 + aten::convolution 1.21% 9.960us 13.86% 113.993us 37.998us 0.000us 0.00% 47.328us 15.776us 3 + aten::_convolution 2.81% 23.101us 
12.64% 104.033us 34.678us 0.000us 0.00% 47.328us 15.776us 3 + aten::_conv_depthwise2d 2.62% 21.561us 7.83% 64.432us 21.477us 47.328us 67.81% 47.328us 15.776us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.328us 67.81% 47.328us 15.776us 3 + aten::to 0.75% 6.180us 66.30% 545.475us 90.913us 0.000us 0.00% 26.559us 4.426us 6 + aten::_to_copy 2.97% 24.459us 65.55% 539.295us 89.882us 0.000us 0.00% 26.559us 4.426us 6 + aten::copy_ 6.14% 50.491us 58.93% 484.862us 80.810us 22.463us 32.19% 26.559us 4.426us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.032us 17.24% 12.032us 4.011us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 14.95% 10.431us 3.477us 3 + Activity Buffer Request 30.21% 248.576us 30.21% 248.576us 248.576us 4.096us 5.87% 4.096us 4.096us 1 + aten::empty_strided 3.64% 29.974us 3.64% 29.974us 4.996us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.32% 208.345us 25.32% 208.345us 23.149us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.09% 17.201us 2.72% 22.401us 2.489us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.11% 9.120us 1.11% 9.120us 0.608us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.32% 10.899us 1.32% 10.899us 3.633us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.15% 9.422us 1.15% 9.422us 3.141us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.80% 6.580us 0.98% 8.070us 2.690us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 879.215us -Self CUDA time total: 69.663us +Self CPU time total: 822.752us +Self CUDA time total: 69.791us @@ -4857,29 +4857,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 365.250us 197.10% 365.250us 365.250us 1 - torch_eager 14.70% 119.032us 99.37% 804.604us 804.604us 0.000us 0.00% 195.299us 195.299us 1 - aten::conv1d 0.95% 7.700us 17.22% 139.393us 46.464us 0.000us 0.00% 133.056us 44.352us 3 - aten::convolution 1.24% 10.040us 16.26% 131.693us 43.898us 0.000us 0.00% 133.056us 44.352us 3 - aten::_convolution 2.91% 23.550us 15.02% 121.653us 40.551us 0.000us 0.00% 133.056us 44.352us 3 - aten::_conv_depthwise2d 2.69% 21.763us 10.08% 81.613us 27.204us 133.056us 71.80% 133.056us 44.352us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.056us 71.80% 133.056us 44.352us 3 - aten::to 0.75% 6.042us 64.10% 518.999us 86.500us 0.000us 0.00% 62.243us 10.374us 6 - aten::_to_copy 2.90% 23.470us 63.35% 512.957us 85.493us 0.000us 0.00% 62.243us 10.374us 6 - aten::copy_ 6.35% 51.412us 56.59% 458.237us 76.373us 52.258us 28.20% 62.243us 10.374us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.250us 15.78% 29.250us 9.750us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 23.008us 12.42% 23.008us 7.669us 3 - Activity Buffer Request 28.43% 230.213us 28.43% 230.213us 230.213us 9.985us 5.39% 9.985us 9.985us 1 - aten::empty_strided 3.86% 31.250us 3.86% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 26.81% 217.052us 26.81% 217.052us 24.117us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.10% 17.030us 2.74% 22.170us 2.463us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.13% 9.170us 1.13% 9.170us 0.611us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.22% 9.870us 1.22% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.18% 9.540us 1.18% 9.540us 3.180us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.78% 6.320us 1.00% 8.100us 2.700us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.276us 192.10% 357.276us 357.276us 1 + torch_eager 7.25% 148.445us 99.75% 2.043ms 2.043ms 0.000us 0.00% 196.063us 196.063us 1 + aten::conv1d 0.28% 5.714us 6.04% 123.725us 41.242us 0.000us 0.00% 133.535us 44.512us 3 + aten::convolution 0.50% 10.209us 5.76% 118.011us 39.337us 0.000us 0.00% 133.535us 44.512us 3 + aten::_convolution 1.22% 24.922us 5.26% 107.802us 35.934us 0.000us 0.00% 133.535us 44.512us 3 + aten::_conv_depthwise2d 1.06% 21.740us 3.25% 66.540us 22.180us 133.535us 71.80% 133.535us 44.512us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.535us 71.80% 133.535us 44.512us 3 + aten::to 0.32% 6.558us 85.01% 1.741ms 290.215us 0.000us 0.00% 62.528us 10.421us 6 + aten::_to_copy 1.28% 26.242us 84.69% 1.735ms 289.122us 0.000us 0.00% 62.528us 10.421us 6 + aten::copy_ 2.37% 48.539us 81.91% 1.678ms 279.634us 52.448us 28.20% 62.528us 10.421us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.536us 15.88% 29.536us 9.845us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 12.32% 22.912us 7.637us 3 + Activity Buffer Request 70.45% 1.443ms 70.45% 1.443ms 1.443ms 10.080us 5.42% 10.080us 10.080us 1 + aten::empty_strided 1.50% 30.691us 1.50% 30.691us 5.115us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 10.22% 209.265us 10.22% 209.265us 23.252us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.93% 19.072us 1.20% 24.640us 2.738us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.45% 9.247us 0.45% 9.247us 0.616us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.55% 11.270us 0.55% 11.270us 3.757us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.51% 10.520us 0.51% 10.520us 3.507us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.29% 5.931us 0.35% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 809.694us -Self CUDA time total: 185.314us +Self CPU time total: 2.048ms +Self CUDA time total: 185.983us @@ -4889,29 +4889,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 352.824us 168.80% 352.824us 352.824us 1 - torch_eager 14.40% 121.160us 99.40% 836.424us 836.424us 0.000us 0.00% 222.266us 222.266us 1 - aten::conv1d 0.71% 5.981us 14.17% 119.243us 39.748us 0.000us 0.00% 153.724us 51.241us 3 - aten::convolution 1.17% 9.810us 13.46% 113.262us 37.754us 
0.000us 0.00% 153.724us 51.241us 3 - aten::_convolution 2.76% 23.250us 12.29% 103.452us 34.484us 0.000us 0.00% 153.724us 51.241us 3 - aten::_conv_depthwise2d 2.65% 22.340us 7.64% 64.321us 21.440us 153.724us 73.55% 153.724us 51.241us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.724us 73.55% 153.724us 51.241us 3 - aten::to 0.70% 5.880us 67.58% 568.691us 94.782us 0.000us 0.00% 68.542us 11.424us 6 - aten::_to_copy 2.81% 23.631us 66.88% 562.811us 93.802us 0.000us 0.00% 68.542us 11.424us 6 - aten::copy_ 7.48% 62.921us 60.21% 506.640us 84.440us 55.294us 26.45% 68.542us 11.424us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.352us 15.48% 32.352us 10.784us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.942us 10.98% 22.942us 7.647us 3 - Activity Buffer Request 31.88% 268.245us 31.88% 268.245us 268.245us 13.248us 6.34% 13.248us 13.248us 1 - aten::empty_strided 3.87% 32.540us 3.87% 32.540us 5.423us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 23.51% 197.824us 23.51% 197.824us 21.980us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.07% 17.378us 2.68% 22.521us 2.502us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.06% 8.883us 1.06% 8.883us 0.592us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.19% 9.991us 1.19% 9.991us 3.330us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.15% 9.640us 1.15% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.71% 5.990us 0.89% 7.470us 2.490us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.235us 170.21% 358.235us 358.235us 1 + torch_eager 15.50% 124.275us 99.34% 796.461us 796.461us 0.000us 0.00% 224.253us 224.253us 1 + aten::conv1d 0.70% 5.590us 14.78% 118.483us 39.494us 0.000us 0.00% 154.174us 51.391us 3 + aten::convolution 1.24% 9.921us 14.08% 112.893us 37.631us 0.000us 0.00% 154.174us 51.391us 3 + aten::_convolution 
2.81% 22.549us 12.84% 102.972us 34.324us 0.000us 0.00% 154.174us 51.391us 3 + aten::_conv_depthwise2d 2.82% 22.632us 8.11% 65.062us 21.687us 154.174us 73.26% 154.174us 51.391us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.174us 73.26% 154.174us 51.391us 3 + aten::to 0.74% 5.971us 65.46% 524.833us 87.472us 0.000us 0.00% 70.079us 11.680us 6 + aten::_to_copy 3.23% 25.880us 64.72% 518.862us 86.477us 0.000us 0.00% 70.079us 11.680us 6 + aten::copy_ 6.33% 50.713us 57.67% 462.401us 77.067us 56.287us 26.74% 70.079us 11.680us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 33.248us 15.80% 33.248us 11.083us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.039us 10.95% 23.039us 7.680us 3 + Activity Buffer Request 28.19% 225.995us 28.19% 225.995us 225.995us 13.792us 6.55% 13.792us 13.792us 1 + aten::empty_strided 3.81% 30.581us 3.81% 30.581us 5.097us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 25.98% 208.263us 25.98% 208.263us 23.140us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.24% 17.992us 2.91% 23.301us 2.589us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.16% 9.309us 1.16% 9.309us 0.621us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.31% 10.480us 1.31% 10.480us 3.493us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.17% 9.380us 1.17% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.74% 5.910us 0.92% 7.370us 2.457us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 841.495us -Self CUDA time total: 209.018us +Self CPU time total: 801.751us +Self CUDA time total: 210.461us @@ -4921,29 +4921,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2 
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 6.78% 125.712us 53.74% 996.387us 996.387us 0.000us 0.00% 1.527ms 1.527ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.423ms 100.39% 1.423ms 1.423ms 1 - aten::to 0.35% 6.438us 38.84% 720.182us 120.030us 0.000us 0.00% 832.992us 138.832us 6 - aten::_to_copy 1.55% 28.691us 38.49% 713.744us 118.957us 0.000us 0.00% 832.992us 138.832us 6 - aten::copy_ 2.90% 53.742us 26.33% 488.279us 81.380us 724.000us 51.06% 832.992us 138.832us 6 - aten::conv1d 0.38% 6.960us 6.55% 121.533us 40.511us 0.000us 0.00% 693.950us 231.317us 3 - aten::convolution 0.56% 10.430us 6.18% 114.573us 38.191us 0.000us 0.00% 693.950us 231.317us 3 - aten::_convolution 1.25% 23.268us 5.62% 104.143us 34.714us 0.000us 0.00% 693.950us 231.317us 3 - aten::_conv_depthwise2d 1.23% 22.830us 3.48% 64.552us 21.517us 693.950us 48.94% 693.950us 231.317us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 693.950us 48.94% 693.950us 231.317us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 410.655us 28.96% 410.655us 136.885us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 
0.00% 0.000us 0.00% 0.000us 0.000us 313.345us 22.10% 313.345us 104.448us 3 - Activity Buffer Request 13.73% 254.654us 13.73% 254.654us 254.654us 108.992us 7.69% 108.992us 108.992us 1 - aten::empty_strided 2.01% 37.271us 10.61% 196.774us 32.796us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.89% 201.884us 10.89% 201.884us 22.432us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.98% 18.223us 1.29% 23.933us 2.659us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.51% 9.490us 0.51% 9.490us 0.633us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.54% 10.101us 0.54% 10.101us 3.367us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.52% 9.620us 0.52% 9.620us 3.207us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.34% 6.270us 0.41% 7.680us 2.560us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 7.15% 131.473us 52.77% 970.085us 970.085us 0.000us 0.00% 1.521ms 1.521ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.40% 1.421ms 1.421ms 1 + aten::to 0.36% 6.571us 37.17% 683.219us 113.870us 0.000us 0.00% 824.180us 137.363us 6 + aten::_to_copy 1.61% 29.612us 36.81% 676.648us 112.775us 0.000us 0.00% 824.180us 137.363us 6 + aten::copy_ 2.81% 51.569us 25.14% 462.051us 77.009us 718.613us 50.76% 824.180us 137.363us 6 + aten::conv1d 0.36% 6.680us 6.82% 125.423us 41.808us 0.000us 0.00% 696.981us 232.327us 3 + aten::convolution 0.57% 10.460us 6.46% 118.743us 39.581us 0.000us 0.00% 696.981us 232.327us 3 + aten::_convolution 1.31% 24.040us 5.89% 108.283us 36.094us 0.000us 0.00% 696.981us 232.327us 3 + aten::_conv_depthwise2d 1.25% 22.981us 3.69% 67.913us 22.638us 696.981us 49.24% 696.981us 232.327us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.981us 49.24% 696.981us 232.327us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 410.458us 29.00% 410.458us 136.819us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 
0.00% 0.000us 0.00% 0.000us 0.000us 308.155us 21.77% 308.155us 102.718us 3 + Activity Buffer Request 11.91% 218.936us 11.91% 218.936us 218.936us 105.567us 7.46% 105.567us 105.567us 1 + aten::empty_strided 2.01% 37.011us 10.06% 184.985us 30.831us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.74% 215.777us 11.74% 215.777us 23.975us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.99% 18.200us 1.31% 24.000us 2.667us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.53% 9.740us 0.53% 9.740us 0.649us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.59% 10.839us 0.59% 10.839us 3.613us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.54% 9.862us 0.54% 9.862us 3.287us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.34% 6.240us 0.42% 7.700us 2.567us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.854ms -Self CUDA time total: 1.418ms +Self CPU time total: 1.838ms +Self CUDA time total: 1.416ms @@ -4953,109 +4953,57 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 4.03% 122.972us 65.43% 1.999ms 1.999ms 0.000us 0.00% 1.502ms 1.502ms 1 + torch_eager 6.74% 124.615us 43.66% 806.720us 806.720us 0.000us 0.00% 1.502ms 1.502ms 1 torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.433ms 100.41% 1.433ms 1.433ms 1 - aten::to 0.19% 5.740us 56.63% 1.730ms 288.331us 0.000us 0.00% 
766.432us 127.739us 6 - aten::_to_copy 0.79% 24.119us 56.45% 1.724ms 287.375us 0.000us 0.00% 766.432us 127.739us 6 - aten::copy_ 1.70% 52.020us 54.70% 1.671ms 278.493us 691.168us 48.43% 766.432us 127.739us 6 - aten::conv1d 0.23% 6.891us 3.86% 118.002us 39.334us 0.000us 0.00% 736.031us 245.344us 3 - aten::convolution 0.33% 9.930us 3.64% 111.111us 37.037us 0.000us 0.00% 736.031us 245.344us 3 - aten::_convolution 0.74% 22.558us 3.31% 101.181us 33.727us 0.000us 0.00% 736.031us 245.344us 3 - aten::_conv_depthwise2d 0.70% 21.291us 2.07% 63.232us 21.077us 736.031us 51.57% 736.031us 245.344us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 736.031us 51.57% 736.031us 245.344us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 401.120us 28.11% 401.120us 133.707us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 290.048us 20.32% 290.048us 96.683us 3 - Activity Buffer Request 47.17% 1.441ms 47.17% 1.441ms 1.441ms 75.264us 5.27% 75.264us 75.264us 1 - aten::empty_strided 0.95% 29.171us 0.95% 29.171us 4.862us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 6.58% 201.084us 6.58% 201.084us 22.343us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.57% 17.550us 0.75% 22.971us 2.552us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.30% 9.131us 0.30% 9.131us 0.609us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.33% 9.960us 0.33% 9.960us 3.320us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.30% 9.060us 0.30% 9.060us 3.020us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.18% 5.561us 0.23% 7.041us 2.347us 0.000us 0.00% 0.000us 0.000us 3 + aten::to 0.34% 6.269us 28.35% 523.751us 87.292us 0.000us 0.00% 764.786us 127.464us 6 + aten::_to_copy 1.27% 23.480us 28.01% 517.482us 86.247us 0.000us 0.00% 764.786us 127.464us 6 + aten::copy_ 2.74% 50.661us 25.15% 464.712us 77.452us 690.099us 48.36% 764.786us 127.464us 6 + aten::conv1d 0.32% 
5.870us 7.00% 129.374us 43.125us 0.000us 0.00% 737.040us 245.680us 3 + aten::convolution 0.54% 9.999us 6.68% 123.504us 41.168us 0.000us 0.00% 737.040us 245.680us 3 + aten::_convolution 1.31% 24.293us 6.14% 113.505us 37.835us 0.000us 0.00% 737.040us 245.680us 3 + aten::_conv_depthwise2d 1.62% 30.010us 3.95% 73.060us 24.353us 737.040us 51.64% 737.040us 245.680us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.040us 51.64% 737.040us 245.680us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 399.673us 28.01% 399.673us 133.224us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 290.426us 20.35% 290.426us 96.809us 3 + Activity Buffer Request 12.15% 224.466us 12.15% 224.466us 224.466us 74.687us 5.23% 74.687us 74.687us 1 + aten::empty_strided 1.59% 29.290us 1.59% 29.290us 4.882us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.52% 212.785us 11.52% 212.785us 23.643us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.94% 17.281us 1.23% 22.771us 2.530us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.55% 10.081us 0.55% 10.081us 0.672us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.57% 10.440us 0.57% 10.440us 3.480us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.51% 9.410us 0.51% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.33% 6.150us 0.41% 7.641us 2.547us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.055ms +Self CPU time total: 1.848ms Self CUDA time total: 1.427ms impl wl p50(ms) ok torch_eager cuda_B2_D2048_S128_W2 0.09 True torch_eager cuda_B2_D2048_S128_W4 0.08 True -torch_eager cuda_B2_D2048_S2048_W2 0.14 True +torch_eager cuda_B2_D2048_S2048_W2 0.15 True torch_eager cuda_B2_D2048_S2048_W4 0.16 
True torch_eager cuda_B2_D2048_S512_W2 0.09 True -torch_eager cuda_B2_D2048_S512_W4 0.08 True +torch_eager cuda_B2_D2048_S512_W4 0.09 True torch_eager cuda_B2_D64_S128_W2 0.07 True torch_eager cuda_B2_D64_S128_W4 0.09 True torch_eager cuda_B2_D64_S2048_W2 0.09 True -torch_eager cuda_B2_D64_S2048_W4 0.08 True +torch_eager cuda_B2_D64_S2048_W4 0.09 True torch_eager cuda_B2_D64_S512_W2 0.09 True torch_eager cuda_B2_D64_S512_W4 0.09 True -torch_eager cuda_B4_D2048_S128_W2 0.08 True -torch_eager cuda_B4_D2048_S128_W4 0.08 True +torch_eager cuda_B4_D2048_S128_W2 0.09 True +torch_eager cuda_B4_D2048_S128_W4 0.09 True torch_eager cuda_B4_D2048_S2048_W2 0.49 True torch_eager cuda_B4_D2048_S2048_W4 0.50 True -torch_eager cuda_B4_D2048_S512_W2 0.09 True +torch_eager cuda_B4_D2048_S512_W2 0.10 True torch_eager cuda_B4_D2048_S512_W4 0.10 True -torch_eager cuda_B4_D64_S128_W2 0.08 True +torch_eager cuda_B4_D64_S128_W2 0.09 True torch_eager cuda_B4_D64_S128_W4 0.08 True -torch_eager cuda_B4_D64_S2048_W2 0.08 True -torch_eager cuda_B4_D64_S2048_W4 0.08 True -torch_eager cuda_B4_D64_S512_W2 0.08 True -torch_eager cuda_B4_D64_S512_W4 0.08 True +torch_eager cuda_B4_D64_S2048_W2 0.09 True +torch_eager cuda_B4_D64_S2048_W4 0.09 True +torch_eager cuda_B4_D64_S512_W2 0.09 True +torch_eager cuda_B4_D64_S512_W4 0.09 True
-
-
▶ UV Install Logs
- -

Artifacts:

causal_conv1d.jsonl diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg index 9b058d2666ce3f17f1e0271794e89c52b55a50d5..1051764b171c27ddd8f8651b286d107eb666bd69 100644 --- a/causal_conv1d/results/artifacts/combine/latency.svg +++ b/causal_conv1d/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf8858bb054bd7e8f82af77fd05a6475b7ee3a9a335ba4a6506cd1c694804777 +oid sha256:6fdf61512b0add92f3d8e4a284ecb814f7a3b11b2db0fe3af610896a05d7072f size 35426 diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html index 478077209c7e2fef5044dc68f9a6ef240e0167c9..6a99b42f98995858e618176be6ad4beb1b59c2c4 100644 --- a/causal_conv1d/results/combined_results.html +++ b/causal_conv1d/results/combined_results.html @@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content { - 2025-10-30T15:53:58.349427 + 2025-10-31T20:14:05.716143 image/svg+xml @@ -4451,70 +4451,70 @@ body[data-tool="eraser"] .main-content { - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 @@ -4522,66 +4522,66 @@ body[data-tool="eraser"] .main-content { - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + @@ -4640,7 +4640,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.38s +Cell: combine | 4.43s | Raw @@ -4753,28 +4753,28 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True torch_eager cuda_B2_D2048_S128_W2 0.09 True torch_eager cuda_B2_D2048_S128_W4 0.08 True -torch_eager cuda_B2_D2048_S2048_W2 0.14 True +torch_eager cuda_B2_D2048_S2048_W2 0.15 True torch_eager cuda_B2_D2048_S2048_W4 0.16 True torch_eager cuda_B2_D2048_S512_W2 0.09 True -torch_eager 
cuda_B2_D2048_S512_W4 0.08 True +torch_eager cuda_B2_D2048_S512_W4 0.09 True torch_eager cuda_B2_D64_S128_W2 0.07 True torch_eager cuda_B2_D64_S128_W4 0.09 True torch_eager cuda_B2_D64_S2048_W2 0.09 True -torch_eager cuda_B2_D64_S2048_W4 0.08 True +torch_eager cuda_B2_D64_S2048_W4 0.09 True torch_eager cuda_B2_D64_S512_W2 0.09 True torch_eager cuda_B2_D64_S512_W4 0.09 True -torch_eager cuda_B4_D2048_S128_W2 0.08 True -torch_eager cuda_B4_D2048_S128_W4 0.08 True +torch_eager cuda_B4_D2048_S128_W2 0.09 True +torch_eager cuda_B4_D2048_S128_W4 0.09 True torch_eager cuda_B4_D2048_S2048_W2 0.49 True torch_eager cuda_B4_D2048_S2048_W4 0.50 True -torch_eager cuda_B4_D2048_S512_W2 0.09 True +torch_eager cuda_B4_D2048_S512_W2 0.10 True torch_eager cuda_B4_D2048_S512_W4 0.10 True -torch_eager cuda_B4_D64_S128_W2 0.08 True +torch_eager cuda_B4_D64_S128_W2 0.09 True torch_eager cuda_B4_D64_S128_W4 0.08 True -torch_eager cuda_B4_D64_S2048_W2 0.08 True -torch_eager cuda_B4_D64_S2048_W4 0.08 True -torch_eager cuda_B4_D64_S512_W2 0.08 True -torch_eager cuda_B4_D64_S512_W4 0.08 True +torch_eager cuda_B4_D64_S2048_W2 0.09 True +torch_eager cuda_B4_D64_S2048_W4 0.09 True +torch_eager cuda_B4_D64_S512_W2 0.09 True +torch_eager cuda_B4_D64_S512_W4 0.09 True GENERATING COMBINED VISUALIZATION @@ -4794,7 +4794,7 @@ Implementations included:
▶ UV Install Logs
@@ -4807,7 +4807,7 @@ Installed 37 packages in 211ms - 2025-10-30T15:53:58.349427 + 2025-10-31T20:14:05.716143 image/svg+xml @@ -5151,70 +5151,70 @@ Installed 37 packages in 211ms - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 @@ -5222,66 +5222,66 @@ Installed 37 packages in 211ms - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl b/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52c7930d88f40dd4da2a4cc2aa3b8068bb350deb --- /dev/null +++ b/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl @@ -0,0 +1,4 @@ +{"ts": "2025-10-31T20:13:50Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.3733269999629556, "p50": 3.3932979999917734, "p90": 3.4002180000243243, "mean": 3.393551400040451, "iqr": 0.010580999969533877, "raw_times": [3.3896370000547904, 3.4002180000243243, 3.3932979999917734, 3.3733269999629556, 3.411277000168411], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.4049870000671945, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null} +{"ts": 
"2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.99112300010529, "p50": 4.007804000139004, "p90": 4.020502999992459, "mean": 4.014501400024528, "iqr": 0.017490000118414173, "raw_times": [4.050064000011844, 4.020502999992459, 4.007804000139004, 4.003012999874045, 3.99112300010529], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.017783999870517, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null} +{"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.004662999932407, "p50": 4.020202999981848, "p90": 4.030714000009539, "mean": 4.022331200030749, "iqr": 0.011850999953821884, "raw_times": [4.018863000055717, 4.004662999932407, 4.0372130001742335, 4.020202999981848, 4.030714000009539], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.032904000041526, 
"peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null} +{"ts": "2025-10-31T20:13:52Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.005022999990615, "p50": 4.020072999992408, "p90": 4.0240040000298904, "mean": 4.01746140000796, "iqr": 0.009850999958871398, "raw_times": [4.014153000071019, 4.005022999990615, 4.024053999955868, 4.0240040000298904, 4.020072999992408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.024974000003567, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null} diff --git a/deformable_detr/impls/cells/benchmark.py b/deformable_detr/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..66ccdf2085524240060089c8658a5256c484037b --- /dev/null +++ b/deformable_detr/impls/cells/benchmark.py @@ -0,0 +1,118 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +import torch +import sys +from kernels_benchmark_tools import KernelTypeEnum, run_benchmark + + +def torch_deformable_detr( + value, spatial_shapes, level_start_index, 
sampling_locations, attention_weights, im2col_step=64 +): + """ + PyTorch native reference implementation of multi-scale deformable attention. + Uses vectorized bilinear interpolation for reasonable performance. + """ + bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + _, _, _, channels = value.shape + + output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype) + + # Split value tensor by levels + value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1) + + # Iterate through each level (can't avoid this loop easily) + for level_idx in range(num_levels): + h, w = spatial_shapes[level_idx].tolist() + value_level = value_list[level_idx] # (bs, h*w, num_heads, channels) + + # Reshape to spatial grid: (bs, num_heads, channels, h, w) + value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2) + + # Get sampling locations and weights for this level + # loc: (bs, num_queries, num_heads, num_points, 2) + loc = sampling_locations[:, :, :, level_idx, :, :] + # weight: (bs, num_queries, num_heads, num_points) + weight = attention_weights[:, :, :, level_idx, :] + + # Convert normalized coordinates to pixel coordinates + # loc[..., 0] is x (width), loc[..., 1] is y (height) + x = loc[..., 0] * w - 0.5 # (bs, num_queries, num_heads, num_points) + y = loc[..., 1] * h - 0.5 + + # Get integer coordinates for bilinear interpolation + x0 = torch.floor(x).long() + y0 = torch.floor(y).long() + x1 = x0 + 1 + y1 = y0 + 1 + + # Compute interpolation weights BEFORE clamping (important!) 
+ lw = x - x0.float() # weight for x direction + lh = y - y0.float() # weight for y direction + hw = 1 - lw + hh = 1 - lh + + # Create mask for valid sample locations + valid = (y > -1) & (x > -1) & (y < h) & (x < w) + + # Create masks for each corner being in bounds + mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float() + mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float() + mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float() + mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float() + + # Clamp coordinates for safe indexing + x0_clamped = torch.clamp(x0, 0, w - 1) + x1_clamped = torch.clamp(x1, 0, w - 1) + y0_clamped = torch.clamp(y0, 0, h - 1) + y1_clamped = torch.clamp(y1, 0, h - 1) + + # Bilinear interpolation weights for all 4 corners + w_tl = (hh * hw).unsqueeze(-1) # top-left: (bs, num_queries, num_heads, num_points, 1) + w_tr = (hh * lw).unsqueeze(-1) # top-right + w_bl = (lh * hw).unsqueeze(-1) # bottom-left + w_br = (lh * lw).unsqueeze(-1) # bottom-right + + # Gather values from the 4 corners using advanced indexing + batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points) + head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points) + + # Gather corner values with clamped indices, then apply corner masks + v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl + v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr + v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl + v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br + + # Bilinear interpolation + sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br + + # Apply valid mask (only accumulate if entire sample location is valid) + sampled = sampled * valid.unsqueeze(-1).float() + + # Apply attention weights and sum over points + # weight: 
(bs, num_queries, num_heads, num_points) + # Expand weight: (bs, num_queries, num_heads, num_points, 1) + weighted_sampled = sampled * weight.unsqueeze(-1) + + # Sum over points: (bs, num_queries, num_heads, channels) + output += weighted_sampled.sum(dim=3) + + # Flatten last two dimensions to match kernel output + return output.reshape(bs, num_queries, num_heads * channels) + + +run_benchmark( + kernel_type=KernelTypeEnum.DEFORMABLE_DETR, + impl_name="torch_eager", + impl_tags={"family": "pytorch", "backend": "eager"}, + impl_func=torch_deformable_detr, + dtype="float32", +) \ No newline at end of file diff --git a/deformable_detr/impls/cells/nv.py b/deformable_detr/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5 --- /dev/null +++ b/deformable_detr/impls/cells/nv.py @@ -0,0 +1,2 @@ +import subprocess +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/deformable_detr/impls/hf_kernels_deformable_detr.html b/deformable_detr/impls/hf_kernels_deformable_detr.html new file mode 100644 index 0000000000000000000000000000000000000000..8203846442acfc0a17b0a7372d2971964aac9caf --- /dev/null +++ b/deformable_detr/impls/hf_kernels_deformable_detr.html @@ -0,0 +1,4350 @@ + + + + + + hf_kernels_deformable_detr + + + + + + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

HF Kernels - Deformable DETR

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +🤗 HF +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Fri Oct 31 20:13:34 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   43C    P0             83W /  350W |       0MiB /  46068MiB |     60%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Deformable DETR Multi-Scale Deformable Attention Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 8.30s + | + +Raw +GitHub +🤗 HF +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the deformable DETR kernel
+deformable_detr = get_kernel("kernels-community/deformable-detr")
+
+
+def hf_kernels_deformable_detr(
+    value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
+):
+    """HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention"""
+    return deformable_detr.ms_deform_attn_forward(
+        value=value,
+        spatial_shapes=spatial_shapes,
+        level_start_index=level_start_index,
+        sampling_loc=sampling_locations,
+        attn_weight=attention_weights,
+        im2col_step=im2col_step
+    )
+
+
+# Script entry point: hand the wrapper to the shared benchmark harness under the
+# DEFORMABLE_DETR kernel type, labelled with this implementation's name and tags
+# and run with float32 inputs.
+run_benchmark(
+    kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
+    impl_name="hf_kernels_deformable_detr",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_deformable_detr,
+    dtype="float32",
+)
+
+ +
+
+
+
+
Running deformable_detr benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     195.201us       770.15%     195.201us     195.201us             1  
+                             hf_kernels_deformable_detr         7.43%     141.524us        99.61%       1.898ms       1.898ms       0.000us         0.00%      26.403us      26.403us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         3.93%      74.960us        92.19%       1.756ms     585.455us      22.464us        88.63%      26.403us       8.801us             3  
+void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      22.464us        88.63%      22.464us       7.488us             3  
+                                            aten::zeros         1.20%      22.800us        85.08%       1.621ms     540.337us       0.000us         0.00%       3.939us       1.313us             3  
+                                            aten::zero_         0.89%      16.910us        82.13%       1.565ms     521.590us       0.000us         0.00%       3.939us       1.313us             3  
+                                            aten::fill_         1.72%      32.820us        81.24%       1.548ms     515.953us       2.882us        11.37%       3.939us       1.313us             3  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.882us        11.37%       2.882us       0.961us             3  
+                                Activity Buffer Request        77.24%       1.472ms        77.24%       1.472ms       1.472ms       1.057us         4.17%       1.057us       1.057us             1  
+                                            aten::empty         1.76%      33.441us         1.76%      33.441us      11.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.19%      60.842us         3.19%      60.842us      10.140us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.89%      16.922us         0.89%      16.922us       2.820us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         1.13%      21.591us         1.37%      26.081us       8.694us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.24%       4.490us         0.24%       4.490us       1.497us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.39%       7.340us         0.39%       7.340us       7.340us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.905ms
+Self CUDA time total: 25.346us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     144.191us       546.22%     144.191us     144.191us             1  
+                             hf_kernels_deformable_detr         4.39%      75.912us        99.67%       1.722ms       1.722ms       0.000us         0.00%      27.358us      27.358us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         2.01%      34.700us        95.28%       1.646ms     548.647us      23.550us        89.21%      27.358us       9.119us             3  
+void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      23.550us        89.21%      23.550us       7.850us             3  
+                                            aten::zeros         0.49%       8.451us        91.07%       1.573ms     524.424us       0.000us         0.00%       3.808us       1.269us             3  
+                                            aten::zero_         0.50%       8.669us        89.54%       1.547ms     515.616us       0.000us         0.00%       3.808us       1.269us             3  
+                                            aten::fill_         1.60%      27.701us        89.04%       1.538ms     512.727us       2.848us        10.79%       3.808us       1.269us             3  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.848us        10.79%       2.848us       0.949us             3  
+                                Activity Buffer Request        85.90%       1.484ms        85.90%       1.484ms       1.484ms       0.960us         3.64%       0.960us       0.960us             1  
+                                            aten::empty         1.04%      17.971us         1.04%      17.971us       5.990us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.40%      41.442us         2.40%      41.442us       6.907us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.54%       9.400us         0.54%       9.400us       1.567us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.66%      11.329us         0.79%      13.720us       4.573us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.14%       2.391us         0.14%       2.391us       0.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.680us         0.33%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.728ms
+Self CUDA time total: 26.398us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     140.288us       549.37%     140.288us     140.288us             1  
+                             hf_kernels_deformable_detr         4.34%      74.492us        99.67%       1.709ms       1.709ms       0.000us         0.00%      26.464us      26.464us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         1.96%      33.680us        95.32%       1.635ms     544.984us      22.752us        89.10%      26.464us       8.821us             3  
+void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      22.752us        89.10%      22.752us       7.584us             3  
+                                            aten::zeros         0.50%       8.650us        91.19%       1.564ms     521.367us       0.000us         0.00%       3.712us       1.237us             3  
+                                            aten::zero_         0.47%       8.130us        89.69%       1.538ms     512.773us       0.000us         0.00%       3.712us       1.237us             3  
+                                            aten::fill_         1.63%      27.881us        89.21%       1.530ms     510.063us       2.784us        10.90%       3.712us       1.237us             3  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.784us        10.90%       2.784us       0.928us             3  
+                                Activity Buffer Request        86.04%       1.476ms        86.04%       1.476ms       1.476ms       0.928us         3.63%       0.928us       0.928us             1  
+                                            aten::empty         1.00%      17.131us         1.00%      17.131us       5.710us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.42%      41.510us         2.42%      41.510us       6.918us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.52%       8.991us         0.52%       8.991us       1.498us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.62%      10.681us         0.77%      13.291us       4.430us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.15%       2.610us         0.15%       2.610us       0.870us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.730us         0.33%       5.730us       5.730us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.715ms
+Self CUDA time total: 25.536us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     151.934us       322.76%     151.934us     151.934us             1  
+                             hf_kernels_deformable_detr         3.86%      74.313us        99.75%       1.919ms       1.919ms       0.000us         0.00%      48.129us      48.129us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         1.79%      34.420us        95.88%       1.844ms     614.769us      43.968us        93.40%      48.129us      16.043us             3  
+void ms_deformable_im2col_gpu_kernel<float>(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      43.968us        93.40%      43.968us      14.656us             3  
+                                            aten::zeros         0.45%       8.600us        92.03%       1.770ms     590.092us       0.000us         0.00%       4.161us       1.387us             3  
+                                            aten::zero_         0.45%       8.690us        90.72%       1.745ms     581.642us       0.000us         0.00%       4.161us       1.387us             3  
+                                            aten::fill_         1.44%      27.641us        90.26%       1.736ms     578.745us       3.105us         6.60%       4.161us       1.387us             3  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.105us         6.60%       3.105us       1.035us             3  
+                                Activity Buffer Request        76.84%       1.478ms        76.84%       1.478ms       1.478ms       1.056us         2.24%       1.056us       1.056us             1  
+                                            aten::empty         0.87%      16.750us         0.87%      16.750us       5.583us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        12.74%     245.037us        12.74%     245.037us      40.839us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.49%       9.420us         0.49%       9.420us       1.570us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.66%      12.781us         0.82%      15.781us       5.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.16%       3.000us         0.16%       3.000us       1.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       4.890us         0.25%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.924ms
+Self CUDA time total: 47.073us
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4     0.04  True
+hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4     0.05  True
+
+
+
▶ UV Install Logs
+ +
+
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] +Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:00, 6.20it/s] +Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 9.26it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 12.59it/s]
+
+

Artifacts:

+deformable_detr.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/deformable_detr/impls/index.html b/deformable_detr/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..507f4753d9c1efbdcbe259d5a8105e4524b0527f --- /dev/null +++ b/deformable_detr/impls/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /deformable_detr/impls + + + +
+ ← back +
+

Index of /deformable_detr/impls

+ + + \ No newline at end of file diff --git a/deformable_detr/impls/torch_deformable_detr.html b/deformable_detr/impls/torch_deformable_detr.html new file mode 100644 index 0000000000000000000000000000000000000000..1d330b066f83130623802310ab8c5a5ceec69b71 --- /dev/null +++ b/deformable_detr/impls/torch_deformable_detr.html @@ -0,0 +1,4434 @@ + + + + + + torch_deformable_detr + + + + + + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

PyTorch Native - Deformable DETR

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Fri Oct 31 20:13:34 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   43C    P0             83W /  350W |       0MiB /  46068MiB |     60%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Deformable DETR Multi-Scale Deformable Attention Benchmark (PyTorch Native)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 5.33s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_deformable_detr(
+    value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
+):
+    """
+    PyTorch native reference implementation of multi-scale deformable attention.
+    Uses vectorized bilinear interpolation for reasonable performance.
+    """
+    bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+    _, _, _, channels = value.shape
+
+    output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype)
+
+    # Split value tensor by levels
+    value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1)
+
+    # Iterate through each level (can't avoid this loop easily)
+    for level_idx in range(num_levels):
+        h, w = spatial_shapes[level_idx].tolist()
+        value_level = value_list[level_idx]  # (bs, h*w, num_heads, channels)
+
+        # Reshape to spatial grid: (bs, num_heads, channels, h, w)
+        value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
+
+        # Get sampling locations and weights for this level
+        # loc: (bs, num_queries, num_heads, num_points, 2)
+        loc = sampling_locations[:, :, :, level_idx, :, :]
+        # weight: (bs, num_queries, num_heads, num_points)
+        weight = attention_weights[:, :, :, level_idx, :]
+
+        # Convert normalized coordinates to pixel coordinates
+        # loc[..., 0] is x (width), loc[..., 1] is y (height)
+        x = loc[..., 0] * w - 0.5  # (bs, num_queries, num_heads, num_points)
+        y = loc[..., 1] * h - 0.5
+
+        # Get integer coordinates for bilinear interpolation
+        x0 = torch.floor(x).long()
+        y0 = torch.floor(y).long()
+        x1 = x0 + 1
+        y1 = y0 + 1
+
+        # Compute interpolation weights BEFORE clamping (important!)
+        lw = x - x0.float()  # weight for x direction
+        lh = y - y0.float()  # weight for y direction
+        hw = 1 - lw
+        hh = 1 - lh
+
+        # Create mask for valid sample locations
+        valid = (y > -1) & (x > -1) & (y < h) & (x < w)
+
+        # Create masks for each corner being in bounds
+        mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
+        mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
+        mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
+        mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
+
+        # Clamp coordinates for safe indexing
+        x0_clamped = torch.clamp(x0, 0, w - 1)
+        x1_clamped = torch.clamp(x1, 0, w - 1)
+        y0_clamped = torch.clamp(y0, 0, h - 1)
+        y1_clamped = torch.clamp(y1, 0, h - 1)
+
+        # Bilinear interpolation weights for all 4 corners
+        w_tl = (hh * hw).unsqueeze(-1)  # top-left: (bs, num_queries, num_heads, num_points, 1)
+        w_tr = (hh * lw).unsqueeze(-1)  # top-right
+        w_bl = (lh * hw).unsqueeze(-1)  # bottom-left
+        w_br = (lh * lw).unsqueeze(-1)  # bottom-right
+
+        # Gather values from the 4 corners using advanced indexing
+        batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
+        head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
+
+        # Gather corner values with clamped indices, then apply corner masks
+        v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
+        v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
+        v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
+        v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
+
+        # Bilinear interpolation
+        sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
+
+        # Apply valid mask (only accumulate if entire sample location is valid)
+        sampled = sampled * valid.unsqueeze(-1).float()
+
+        # Apply attention weights and sum over points
+        # weight: (bs, num_queries, num_heads, num_points)
+        # Expand weight: (bs, num_queries, num_heads, num_points, 1)
+        weighted_sampled = sampled * weight.unsqueeze(-1)
+
+        # Sum over points: (bs, num_queries, num_heads, channels)
+        output += weighted_sampled.sum(dim=3)
+
+    # Flatten last two dimensions to match kernel output
+    return output.reshape(bs, num_queries, num_heads * channels)
+
+
+# Script entry point: hand the pure-PyTorch implementation to the shared
+# benchmark harness under the DEFORMABLE_DETR kernel type, reported under the
+# name "torch_eager" and run with float32 inputs.
+run_benchmark(
+    kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_deformable_detr,
+    dtype="float32",
+)
+
+ +
+
+
+
+
Running deformable_detr benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      20.095ms      1353.99%      20.095ms      20.095ms             1  
+                                            torch_eager        21.57%       4.703ms        99.97%      21.796ms      21.796ms       0.000us         0.00%       1.485ms       1.485ms             1  
+                                            aten::index         4.62%       1.006ms        16.78%       3.660ms      76.241us     237.342us        15.99%     371.712us       7.744us            48  
+                                            aten::copy_         4.87%       1.061ms        11.32%       2.469ms      11.275us     365.385us        24.62%     365.385us       1.668us           219  
+                                              aten::mul         5.80%       1.265ms         9.92%       2.163ms      11.267us     294.264us        19.83%     294.264us       1.533us           192  
+void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     237.342us        15.99%     237.342us       4.945us            48  
+                                               aten::to         0.67%     145.268us        11.20%       2.441ms      14.275us       0.000us         0.00%     231.015us       1.351us           171  
+                                         aten::_to_copy         2.25%     489.538us        10.53%       2.296ms      18.665us       0.000us         0.00%     231.015us       1.878us           123  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     202.558us        13.65%     202.558us       1.688us           120  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     167.074us        11.26%     167.074us       1.989us            84  
+                                       aten::contiguous         0.40%      86.639us         8.70%       1.898ms      19.769us       0.000us         0.00%     134.370us       1.400us            96  
+                                            aten::clone         0.85%     185.683us         8.31%       1.811ms      18.866us       0.000us         0.00%     134.370us       1.400us            96  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.370us         9.05%     134.370us       1.400us            96  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.390us         7.77%     115.390us       1.202us            96  
+                                          aten::__and__         0.63%     137.184us         4.49%     979.904us      11.666us       0.000us         0.00%     100.670us       1.198us            84  
+                                      aten::bitwise_and         2.39%     521.552us         3.87%     842.720us      10.032us     100.670us         6.78%     100.670us       1.198us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     100.670us         6.78%     100.670us       1.198us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      85.858us         5.78%      85.858us       1.192us            72  
+                                              aten::sub         2.24%     488.685us         3.68%     801.476us      11.132us      78.884us         5.32%      78.884us       1.096us            72  
+                                              aten::add         1.55%     338.597us         2.59%     564.753us       9.413us      74.082us         4.99%      74.082us       1.235us            60  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 21.803ms
+Self CUDA time total: 1.484ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      18.852ms      1182.31%      18.852ms      18.852ms             1  
+                                            torch_eager        20.99%       4.304ms        99.97%      20.495ms      20.495ms       0.000us         0.00%       1.595ms       1.595ms             1  
+                                            aten::index         4.61%     945.020us        16.80%       3.444ms      71.750us     251.167us        15.75%     382.850us       7.976us            48  
+                                            aten::copy_         5.04%       1.033ms        11.78%       2.414ms      11.023us     364.991us        22.89%     364.991us       1.667us           219  
+                                              aten::mul         5.94%       1.218ms        10.22%       2.095ms      10.911us     359.138us        22.52%     359.138us       1.871us           192  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     267.618us        16.78%     267.618us       2.230us           120  
+void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     251.167us        15.75%     251.167us       5.233us            48  
+                                               aten::to         0.59%     120.975us        11.17%       2.290ms      13.390us       0.000us         0.00%     233.308us       1.364us           171  
+                                         aten::_to_copy         2.01%     411.895us        10.58%       2.169ms      17.632us       0.000us         0.00%     233.308us       1.897us           123  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     168.797us        10.59%     168.797us       2.009us            84  
+                                       aten::contiguous         0.41%      84.261us         8.87%       1.818ms      18.936us       0.000us         0.00%     131.683us       1.372us            96  
+                                            aten::clone         0.84%     172.318us         8.46%       1.734ms      18.058us       0.000us         0.00%     131.683us       1.372us            96  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     131.683us         8.26%     131.683us       1.372us            96  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     118.123us         7.41%     118.123us       1.230us            96  
+                                          aten::__and__         0.40%      81.276us         4.41%     903.196us      10.752us       0.000us         0.00%     104.833us       1.248us            84  
+                                      aten::bitwise_and         2.46%     504.088us         4.01%     821.920us       9.785us     104.833us         6.57%     104.833us       1.248us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.833us         6.57%     104.833us       1.248us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.190us         6.53%     104.190us       1.447us            72  
+                                              aten::add         1.62%     331.582us         2.72%     557.857us       9.298us      91.491us         5.74%      91.491us       1.525us            60  
+                                              aten::sub         2.17%     445.533us         3.70%     758.959us      10.541us      80.509us         5.05%      80.509us       1.118us            72  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 20.501ms
+Self CUDA time total: 1.595ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      18.792ms      1222.95%      18.792ms      18.792ms             1  
+                                            torch_eager        21.02%       4.299ms        99.97%      20.449ms      20.449ms       0.000us         0.00%       1.538ms       1.538ms             1  
+                                            aten::index         4.62%     944.347us        16.78%       3.432ms      71.497us     243.904us        15.87%     378.785us       7.891us            48  
+                                            aten::copy_         5.14%       1.051ms        11.72%       2.396ms      10.942us     368.961us        24.01%     368.961us       1.685us           219  
+                                              aten::mul         5.96%       1.219ms        10.23%       2.092ms      10.898us     325.334us        21.17%     325.334us       1.694us           192  
+void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     243.904us        15.87%     243.904us       5.081us            48  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     234.457us        15.26%     234.457us       1.954us           120  
+                                               aten::to         0.61%     125.558us        11.02%       2.255ms      13.184us       0.000us         0.00%     234.080us       1.369us           171  
+                                         aten::_to_copy         1.92%     392.900us        10.41%       2.129ms      17.309us       0.000us         0.00%     234.080us       1.903us           123  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     169.246us        11.01%     169.246us       2.015us            84  
+                                       aten::contiguous         0.42%      85.559us         8.81%       1.802ms      18.772us       0.000us         0.00%     134.881us       1.405us            96  
+                                            aten::clone         0.80%     164.449us         8.39%       1.717ms      17.880us       0.000us         0.00%     134.881us       1.405us            96  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.881us         8.78%     134.881us       1.405us            96  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.650us         7.53%     115.650us       1.205us            96  
+                                          aten::__and__         0.39%      78.814us         4.36%     891.116us      10.609us       0.000us         0.00%     101.539us       1.209us            84  
+                                      aten::bitwise_and         2.44%     499.687us         3.97%     812.302us       9.670us     101.539us         6.61%     101.539us       1.209us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     101.539us         6.61%     101.539us       1.209us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      96.065us         6.25%      96.065us       1.334us            72  
+                                              aten::add         1.62%     331.717us         2.71%     554.333us       9.239us      83.900us         5.46%      83.900us       1.398us            60  
+                                              aten::sub         2.21%     451.413us         3.69%     755.537us      10.494us      79.361us         5.16%      79.361us       1.102us            72  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 20.454ms
+Self CUDA time total: 1.537ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      19.115ms      1086.36%      19.115ms      19.115ms             1  
+                                            torch_eager        21.90%       4.346ms        99.98%      19.842ms      19.842ms       0.000us         0.00%       1.761ms       1.761ms             1  
+                                              aten::mul         6.18%       1.226ms        10.60%       2.104ms      10.960us     450.887us        25.63%     450.887us       2.348us           192  
+                                            aten::index         4.92%     977.403us        17.78%       3.530ms      73.537us     282.433us        16.05%     420.451us       8.759us            48  
+                                            aten::copy_         5.20%       1.031ms        12.05%       2.392ms      10.922us     372.637us        21.18%     372.637us       1.702us           219  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     357.955us        20.34%     357.955us       2.983us           120  
+void at::native::index_elementwise_kernel<128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     282.433us        16.05%     282.433us       5.884us            48  
+                                               aten::to         0.65%     128.684us        11.66%       2.315ms      13.536us       0.000us         0.00%     234.619us       1.372us           171  
+                                         aten::_to_copy         2.23%     442.466us        11.01%       2.186ms      17.772us       0.000us         0.00%     234.619us       1.907us           123  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     170.397us         9.68%     170.397us       2.029us            84  
+                                       aten::contiguous         0.44%      87.582us         9.26%       1.837ms      19.140us       0.000us         0.00%     138.018us       1.438us            96  
+                                            aten::clone         0.85%     168.452us         8.82%       1.750ms      18.228us       0.000us         0.00%     138.018us       1.438us            96  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     138.018us         7.84%     138.018us       1.438us            96  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     129.055us         7.33%     129.055us       1.792us            72  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     117.244us         6.66%     117.244us       1.221us            96  
+                                              aten::add         1.68%     334.180us         2.81%     557.305us       9.288us     113.660us         6.46%     113.660us       1.894us            60  
+                                          aten::__and__         0.41%      80.800us         4.55%     902.601us      10.745us       0.000us         0.00%     105.726us       1.259us            84  
+                                      aten::bitwise_and         2.56%     508.561us         4.14%     821.801us       9.783us     105.726us         6.01%     105.726us       1.259us            84  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     105.726us         6.01%     105.726us       1.259us            84  
+                                              aten::sub         2.25%     446.108us         3.80%     754.277us      10.476us      82.273us         4.68%      82.273us       1.143us            72  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 19.847ms
+Self CUDA time total: 1.760ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager              cuda_B1_Q100_H8_E256_L4_P4     3.39  True
+torch_eager              cuda_B1_Q300_H8_E256_L4_P4     4.01  True
+torch_eager              cuda_B2_Q100_H8_E256_L4_P4     4.02  True
+torch_eager              cuda_B2_Q300_H8_E256_L4_P4     4.02  True
+
+
+

Artifacts:

+deformable_detr.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/deformable_detr/index.html b/deformable_detr/index.html new file mode 100644 index 0000000000000000000000000000000000000000..8e1ad7fe09342a610e525c8bba679a7f74857855 --- /dev/null +++ b/deformable_detr/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /deformable_detr + + + +
+ ← back +
+

Index of /deformable_detr

+ + + \ No newline at end of file diff --git a/deformable_detr/results/artifacts/combine/latency.svg b/deformable_detr/results/artifacts/combine/latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..cfe61b52935bc93cabc302ceb7b7fc02981aa5f7 --- /dev/null +++ b/deformable_detr/results/artifacts/combine/latency.svg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38828b5c85834f31812d3f314ebdc3cc2e8481610a6d31b84a4f9b0ad78c0f2 +size 17800 diff --git a/deformable_detr/results/cells/combine.py b/deformable_detr/results/cells/combine.py new file mode 100644 index 0000000000000000000000000000000000000000..973c7b81cc8cea6af69ab5e32268c4e63e71c8bb --- /dev/null +++ b/deformable_detr/results/cells/combine.py @@ -0,0 +1,26 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "matplotlib", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +from kernels_benchmark_tools.core.visuals import generate_combined_results + +# Map display names to uvnote environment variables +cache_env_map = { + "HF Kernels Deformable DETR": "UVNOTE_FILE_HF_KERNELS_DEFORMABLE_DETR_BENCHMARK", + "PyTorch Deformable DETR": "UVNOTE_FILE_TORCH_DEFORMABLE_DETR_BENCHMARK", +} + +# Generate combined results with visualization +generate_combined_results( + cache_env_map=cache_env_map, + output_filename="deformable_detr.jsonl", + svg_filename="latency.svg" +) \ No newline at end of file diff --git a/deformable_detr/results/combined_results.html b/deformable_detr/results/combined_results.html new file mode 100644 index 0000000000000000000000000000000000000000..a985624a2d9079877fe0cd1dcdefc5494402713c --- /dev/null +++ b/deformable_detr/results/combined_results.html @@ -0,0 +1,4805 @@ + + + + + + Deformable DETR Benchmark - Combined Results + + + + + + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Deformable DETR Multi-Scale Deformable Attention Benchmarks - Aggregated Results

+

This document combines benchmark results from multiple Deformable DETR implementations.

+

Combined Summary and Visualization

+
+ + + + + + + 2025-10-31T20:14:23.345627 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B1_Q100_H8_E256_L4_P4 + + + + + + + + + + + + + cuda_B1_Q300_H8_E256_L4_P4 + + + + + + + + + + + + + cuda_B2_Q100_H8_E256_L4_P4 + + + + + + + + + + + + + cuda_B2_Q300_H8_E256_L4_P4 + + + + Workload + + + + + + + + + + + + + + + + + 0.0 + + + + + + + + + + + + + 0.5 + + + + + + + + + + + + + 1.0 + + + + + + + + + + + + + 1.5 + + + + + + + + + + + + + 2.0 + + + + + + + + + + + + + 2.5 + + + + + + + + + + + + + 3.0 + + + + + + + + + + + + + 3.5 + + + + + + + + + + + + + 4.0 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + hf_kernels_deformable_detr + + + + + + + + + torch_eager + + + + + + + + + + +
+ +
+
+ +▶ code +▼ output + ▶ uv-logs + | +Cell: combine | 4.34s + | + +Raw +
+ +
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels Deformable DETR    : /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/8ab95d7f8f4c6a375b95806e646e4e6f12f0749960d319cf7587215b378ccfa9
+✓ PyTorch Deformable DETR       : /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/9c0a40cf66719a0b460ebb0ca3b41bcaf6c5486905bbf2045a65be2710694dfa
+
+  ✓ Found HF Kernels Deformable DETR
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/8ab95d7f8f4c6a375b95806e646e4e6f12f0749960d319cf7587215b378ccfa9/deformable_detr.jsonl
+  ✓ Found PyTorch Deformable DETR
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/9c0a40cf66719a0b460ebb0ca3b41bcaf6c5486905bbf2045a65be2710694dfa/deformable_detr.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4     0.04  True
+hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4     0.05  True
+torch_eager              cuda_B1_Q100_H8_E256_L4_P4     3.39  True
+torch_eager              cuda_B1_Q300_H8_E256_L4_P4     4.01  True
+torch_eager              cuda_B2_Q100_H8_E256_L4_P4     4.02  True
+torch_eager              cuda_B2_Q300_H8_E256_L4_P4     4.02  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 8 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+  ✓ HF Kernels Deformable DETR
+  ✓ PyTorch Deformable DETR
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+latency.svg +
+ + + + + + + 2025-10-31T20:14:23.345627 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B1_Q100_H8_E256_L4_P4 + + + + + + + + + + + + + cuda_B1_Q300_H8_E256_L4_P4 + + + + + + + + + + + + + cuda_B2_Q100_H8_E256_L4_P4 + + + + + + + + + + + + + cuda_B2_Q300_H8_E256_L4_P4 + + + + Workload + + + + + + + + + + + + + + + + + 0.0 + + + + + + + + + + + + + 0.5 + + + + + + + + + + + + + 1.0 + + + + + + + + + + + + + 1.5 + + + + + + + + + + + + + 2.0 + + + + + + + + + + + + + 2.5 + + + + + + + + + + + + + 3.0 + + + + + + + + + + + + + 3.5 + + + + + + + + + + + + + 4.0 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + hf_kernels_deformable_detr + + + + + + + + + torch_eager + + + + + + + + + + +
+
+
+
+
+ + + \ No newline at end of file diff --git a/deformable_detr/results/index.html b/deformable_detr/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..259f497868f81b516b1f0c893e4974cda430c731 --- /dev/null +++ b/deformable_detr/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /deformable_detr/results + + + +
+ ← back +
+

Index of /deformable_detr/results

+ + + \ No newline at end of file diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl index de592afa82ec05256019431f6592e8e321594c40..d381f496ddfa4abddae090de1e302f3856ab3fc4 100644 --- a/flash_attn/impls/artifacts/benchmark/attention.jsonl +++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl @@ -1,6 +1,6 @@ -{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9094910000158052, "p50": 0.9113720000186731, "p90": 0.9181919999718957, "mean": 0.9141214000010223, "iqr": 0.007780999965234514, "raw_times": [0.9104110000066612, 0.9094910000158052, 0.9113720000186731, 0.9181919999718957, 0.9211409999920761], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9259819999556385, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 
0.9480720000283327, "p50": 0.9496129999888581, "p90": 0.9558429999856344, "mean": 0.952826599996115, "iqr": 0.00735100002202671, "raw_times": [0.9480720000283327, 0.9484919999636077, 0.9496129999888581, 0.9558429999856344, 0.962113000014142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9554529999604711, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0061439999731192, "p50": 1.0189639999680367, "p90": 1.0215840000000753, "mean": 1.017895999996199, "iqr": 0.0038299999687296804, "raw_times": [1.0189639999680367, 1.025034000008418, 1.0177540000313456, 1.0061439999731192, 1.0215840000000753], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0171540000101231, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": 
"12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0146539999595916, "p50": 1.019383999960155, "p90": 1.0202839999919888, "mean": 1.018159799980367, "iqr": 0.004200999967451935, "raw_times": [1.0202839999919888, 1.0146539999595916, 1.0160830000245369, 1.0203939999655631, 1.019383999960155], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0248149999938505, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1737179999613545, "p50": 1.184327000032681, "p90": 1.1859380000487363, "mean": 1.186479800003326, "iqr": 0.010300000042207103, "raw_times": [1.1756380000065292, 1.1737179999613545, 1.1859380000487363, 1.184327000032681, 1.2127779999673294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1959679999904438, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} -{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", 
"batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1702179999701912, "p50": 1.1838479999823903, "p90": 1.1906280000175684, "mean": 1.1843698000006952, "iqr": 0.016700999992735888, "raw_times": [1.1739270000248325, 1.1702179999701912, 1.1838479999823903, 1.1906280000175684, 1.2032280000084938], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1880579999683505, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.208432000112225, "p50": 1.215130999980829, "p90": 1.2198710001030122, "mean": 1.215487200033749, "iqr": 0.006680000069536618, "raw_times": [1.2208109999392036, 1.208432000112225, 1.2198710001030122, 1.2131910000334756, 1.215130999980829], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2240119999660237, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": 
"torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.26713200006634, "p50": 1.2766830000146001, "p90": 1.277253000125711, "mean": 1.2749268000789016, "iqr": 0.004750000016429112, "raw_times": [1.277253000125711, 1.26713200006634, 1.2766830000146001, 1.281063000078575, 1.2725030001092819], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2717629999769997, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2928539999847999, "p50": 1.3003640001443273, "p90": 1.3163240000721999, "mean": 1.3067478000721167, "iqr": 0.01689100008661626, "raw_times": [1.3003640001443273, 1.2928539999847999, 1.2994329999855836, 1.3163240000721999, 1.3247640001736727], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3026630001604644, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 
2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.3232850001259067, "p50": 1.3295650001055037, "p90": 1.3361950000216893, "mean": 1.332684600038192, "iqr": 0.007890999995652237, "raw_times": [1.328304000026037, 1.3361950000216893, 1.3295650001055037, 1.3232850001259067, 1.3460739999118232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3245140000890387, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4790479999646777, "p50": 1.4950690001569455, "p90": 1.4989779999723396, "mean": 1.4914904000306706, "iqr": 0.017840000055002747, "raw_times": [1.5032190001420531, 1.4950690001569455, 1.4790479999646777, 1.4811379999173369, 1.4989779999723396], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5107090000583412, "peak_bytes": 
315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.511368999899787, "p50": 1.5117090001695033, "p90": 1.512698999931672, "mean": 1.516499199988175, "iqr": 0.00113999999484804, "raw_times": [1.511368999899787, 1.512698999931672, 1.5117090001695033, 1.511558999936824, 1.5351600000030885], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5183190000698232, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null} diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py index 15f02e2ed444e10eba9708f3f69247414b6c962b..8f163bdd918898ced9e858cd4197a85572d7ec8e 100644 --- a/flash_attn/impls/cells/benchmark.py +++ b/flash_attn/impls/cells/benchmark.py @@ -4,7 +4,6 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", -# "kernels", # ] # # [tool.uv.sources] @@ -13,19 +12,18 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -from kernels import get_kernel -# Load the flash attention 3 kernel -hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3") - -def hf_flash_attention3(query, key, value): - return 
hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0] +def torch_flash(q, k, v): + qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v)) + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION): + o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt) + return o.transpose(1, 2).contiguous() run_benchmark( kernel_type=KernelTypeEnum.ATTENTION, - impl_name="hf_kernels_flash_attn3", - impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, - impl_func=hf_flash_attention3, + impl_name="torch_flash_ma", + impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, + impl_func=torch_flash, ) \ No newline at end of file diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html index 501ea20e924b7038a53903e7992899b1953d98eb..1852a8c0fb83365b1e619b7e38354ebd1d45d747 100644 --- a/flash_attn/impls/flash_attention.html +++ b/flash_attn/impls/flash_attention.html @@ -4110,7 +4110,7 @@ Cell: nv | 0.21s | Raw -GitHub +GitHub
@@ -4123,7 +4123,7 @@ Cell: nv | 0.21s
-
Thu Oct 30 15:52:36 2025       
+
Fri Oct 31 20:13:43 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4132,7 +4132,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0             75W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   43C    P0             83W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4154,13 +4154,13 @@ Cell: nv | 0.21s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 7.50s
+Cell: benchmark | 3.87s
  | 
 
 Raw
-GitHub
+GitHub
 
@@ -4207,29 +4207,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.587ms 102.23% 3.587ms 3.587ms 1 - torch_flash_ma 7.11% 370.236us 47.42% 2.468ms 2.468ms 0.000us 0.00% 3.549ms 3.549ms 1 - aten::scaled_dot_product_attention 0.85% 44.391us 4.44% 231.334us 77.111us 0.000us 0.00% 2.791ms 930.498us 3 - aten::_scaled_dot_product_flash_attention 0.51% 26.381us 3.59% 186.943us 62.314us 0.000us 0.00% 2.791ms 930.498us 3 - aten::_flash_attention_forward 0.76% 39.658us 2.57% 134.002us 44.667us 2.791ms 79.55% 2.791ms 930.498us 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.791ms 79.55% 2.791ms 930.498us 3 - aten::contiguous 0.30% 15.641us 34.37% 1.789ms 149.098us 0.000us 0.00% 757.697us 63.141us 12 - aten::clone 0.74% 38.596us 34.07% 1.774ms 147.794us 0.000us 0.00% 757.697us 63.141us 12 - aten::copy_ 1.78% 92.553us 31.63% 1.647ms 137.218us 717.505us 20.45% 757.697us 63.141us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 717.505us 20.45% 717.505us 59.792us 12 - Activity Buffer Request 27.90% 1.452ms 27.90% 1.452ms 1.452ms 40.192us 1.15% 40.192us 40.192us 1 - aten::transpose 1.49% 77.390us 2.00% 104.302us 4.346us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.52% 26.912us 0.52% 26.912us 1.121us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.55% 28.453us 2.13% 110.953us 7.397us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.93% 100.211us 1.93% 100.211us 4.175us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 2.45% 127.363us 2.45% 127.363us 8.491us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.32% 16.580us 0.32% 16.580us 5.527us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.05% 2.441us 0.05% 2.441us 0.407us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.18% 9.241us 0.18% 9.241us 3.080us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 52.58% 2.737ms 52.58% 2.737ms 2.737ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.600ms 101.99% 3.600ms 3.600ms 1 + torch_flash_ma 6.70% 350.157us 46.68% 2.439ms 2.439ms 0.000us 0.00% 3.570ms 3.570ms 1 + aten::scaled_dot_product_attention 0.81% 42.281us 4.26% 222.626us 74.209us 0.000us 0.00% 2.816ms 938.781us 3 + aten::_scaled_dot_product_flash_attention 0.52% 27.002us 3.45% 180.345us 60.115us 0.000us 0.00% 2.816ms 938.781us 3 + aten::_flash_attention_forward 0.79% 41.210us 2.54% 132.453us 44.151us 2.816ms 79.78% 2.816ms 938.781us 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.816ms 79.78% 2.816ms 938.781us 3 + aten::contiguous 0.29% 15.041us 34.44% 1.800ms 149.962us 0.000us 0.00% 753.884us 62.824us 12 + aten::clone 0.75% 38.969us 34.15% 1.785ms 148.709us 0.000us 0.00% 753.884us 62.824us 12 + aten::copy_ 1.73% 90.324us 31.78% 1.661ms 138.388us 713.788us 20.22% 753.884us 62.824us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 713.788us 20.22% 713.788us 59.482us 12 + Activity Buffer Request 28.08% 1.467ms 28.08% 1.467ms 1.467ms 40.096us 1.14% 40.096us 40.096us 1 + aten::transpose 1.25% 65.371us 1.68% 87.543us 3.648us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.42% 22.172us 0.42% 22.172us 0.924us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.53% 27.463us 2.06% 107.524us 7.168us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.78% 93.220us 1.78% 93.220us 3.884us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 2.49% 130.035us 2.49% 130.035us 8.669us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.32% 16.730us 0.32% 16.730us 5.577us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.05% 2.690us 0.05% 2.690us 0.448us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.17% 9.000us 0.17% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 53.32% 2.786ms 53.32% 2.786ms 2.786ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.205ms -Self CUDA time total: 3.509ms +Self CPU time total: 5.225ms +Self CUDA time total: 3.530ms @@ -4239,29 +4239,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 4.72% 248.136us 41.78% 2.196ms 2.196ms 0.000us 0.00% 3.803ms 3.803ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 
0.000us 0.000us 3.759ms 100.28% 3.759ms 3.759ms 1 - aten::scaled_dot_product_attention 0.51% 26.852us 3.40% 178.734us 59.578us 0.000us 0.00% 2.990ms 996.607us 3 - aten::_scaled_dot_product_flash_attention 0.35% 18.418us 2.89% 151.882us 50.627us 0.000us 0.00% 2.990ms 996.607us 3 - aten::_flash_attention_forward 0.65% 34.063us 2.10% 110.562us 36.854us 2.990ms 79.76% 2.990ms 996.607us 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.990ms 79.76% 2.990ms 996.607us 3 - aten::contiguous 0.19% 10.079us 32.75% 1.721ms 143.446us 0.000us 0.00% 813.629us 67.802us 12 - aten::clone 0.54% 28.151us 32.56% 1.711ms 142.606us 0.000us 0.00% 813.629us 67.802us 12 - aten::copy_ 1.97% 103.281us 30.84% 1.621ms 135.084us 758.782us 20.24% 813.629us 67.802us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 758.782us 20.24% 758.782us 63.232us 12 - Activity Buffer Request 27.29% 1.434ms 27.29% 1.434ms 1.434ms 54.847us 1.46% 54.847us 54.847us 1 - aten::transpose 0.98% 51.741us 1.34% 70.423us 2.934us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.36% 18.682us 0.36% 18.682us 0.778us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.38% 19.848us 1.54% 80.939us 5.396us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.45% 76.001us 1.45% 76.001us 3.167us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 2.04% 106.952us 2.04% 106.952us 7.130us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.26% 13.850us 0.26% 13.850us 4.617us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.04% 1.860us 0.04% 1.860us 0.310us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.07% 3.760us 0.07% 3.760us 1.253us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.22% 3.060ms 58.22% 3.060ms 3.060ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 4.88% 260.255us 42.26% 2.252ms 2.252ms 0.000us 0.00% 3.798ms 3.798ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.753ms 100.28% 
3.753ms 3.753ms 1 + aten::scaled_dot_product_attention 0.49% 25.890us 3.50% 186.735us 62.245us 0.000us 0.00% 2.976ms 991.858us 3 + aten::_scaled_dot_product_flash_attention 0.33% 17.842us 3.02% 160.845us 53.615us 0.000us 0.00% 2.976ms 991.858us 3 + aten::_flash_attention_forward 0.74% 39.289us 2.26% 120.363us 40.121us 2.976ms 79.51% 2.976ms 991.858us 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.976ms 79.51% 2.976ms 991.858us 3 + aten::contiguous 0.20% 10.403us 33.03% 1.760ms 146.680us 0.000us 0.00% 822.042us 68.504us 12 + aten::clone 0.53% 28.238us 32.84% 1.750ms 145.813us 0.000us 0.00% 822.042us 68.504us 12 + aten::copy_ 1.51% 80.312us 31.12% 1.659ms 138.210us 766.874us 20.49% 822.042us 68.504us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 766.874us 20.49% 766.874us 63.906us 12 + Activity Buffer Request 28.02% 1.493ms 28.02% 1.493ms 1.493ms 55.168us 1.47% 55.168us 55.168us 1 + aten::transpose 0.94% 50.313us 1.27% 67.673us 2.820us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.33% 17.360us 0.33% 17.360us 0.723us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.40% 21.528us 1.56% 83.370us 5.558us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.43% 76.263us 1.43% 76.263us 3.178us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 2.08% 110.943us 2.08% 110.943us 7.396us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.27% 14.621us 0.27% 14.621us 4.874us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.03% 1.781us 0.03% 1.781us 0.297us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.08% 4.011us 0.08% 4.011us 1.337us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 57.74% 3.077ms 57.74% 3.077ms 3.077ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self 
CPU time total: 5.255ms -Self CUDA time total: 3.749ms +Self CPU time total: 5.329ms +Self CUDA time total: 3.742ms @@ -4271,29 +4271,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 4.59% 242.054us 41.69% 2.201ms 2.201ms 0.000us 0.00% 3.795ms 3.795ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.746ms 100.27% 3.746ms 3.746ms 1 - aten::scaled_dot_product_attention 0.50% 26.150us 3.40% 179.413us 59.804us 0.000us 0.00% 2.957ms 985.581us 3 - aten::_scaled_dot_product_flash_attention 0.35% 18.371us 2.90% 153.263us 51.088us 0.000us 0.00% 2.957ms 985.581us 3 - aten::_flash_attention_forward 0.64% 34.041us 2.11% 111.213us 37.071us 2.957ms 79.14% 2.957ms 985.581us 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.957ms 79.14% 2.957ms 985.581us 3 - aten::contiguous 0.19% 9.991us 32.85% 1.734ms 144.489us 0.000us 0.00% 838.147us 69.846us 12 - aten::clone 0.52% 27.541us 32.66% 1.724ms 143.657us 0.000us 0.00% 838.147us 69.846us 12 - aten::copy_ 1.47% 77.641us 30.91% 1.632ms 135.987us 779.363us 20.86% 838.147us 69.846us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 779.363us 20.86% 779.363us 64.947us 12 - Activity Buffer Request 27.89% 1.472ms 27.89% 1.472ms 1.472ms 58.784us 1.57% 58.784us 58.784us 1 - aten::transpose 0.96% 50.819us 1.31% 69.110us 2.880us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.35% 18.291us 0.35% 18.291us 0.762us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.38% 20.141us 1.58% 83.392us 5.559us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.49% 78.782us 1.49% 78.782us 3.283us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 1.99% 104.800us 1.99% 104.800us 6.987us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.27% 14.320us 0.27% 14.320us 4.773us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.04% 1.870us 0.04% 1.870us 0.312us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.07% 3.720us 0.07% 3.720us 1.240us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.31% 3.078ms 58.31% 3.078ms 3.078ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 4.87% 262.676us 41.62% 2.245ms 2.245ms 0.000us 0.00% 3.882ms 3.882ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.834ms 100.29% 3.834ms 3.834ms 1 + aten::scaled_dot_product_attention 0.50% 26.770us 3.49% 188.015us 62.672us 0.000us 0.00% 3.044ms 1.015ms 3 + aten::_scaled_dot_product_flash_attention 0.35% 18.803us 2.99% 161.245us 53.748us 0.000us 0.00% 3.044ms 1.015ms 3 + aten::_flash_attention_forward 0.74% 39.829us 2.21% 119.102us 39.701us 3.044ms 79.61% 3.044ms 1.015ms 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.044ms 79.61% 3.044ms 1.015ms 3 + aten::contiguous 0.18% 9.451us 32.36% 1.746ms 145.465us 0.000us 0.00% 838.367us 69.864us 12 + aten::clone 0.54% 28.881us 32.18% 1.736ms 144.678us 0.000us 0.00% 838.367us 69.864us 12 + aten::copy_ 1.51% 81.201us 30.48% 1.644ms 137.016us 779.615us 20.39% 838.367us 69.864us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 779.615us 20.39% 779.615us 64.968us 12 + Activity Buffer Request 27.31% 1.473ms 27.31% 1.473ms 1.473ms 58.752us 1.54% 58.752us 58.752us 1 + aten::transpose 1.01% 54.592us 1.34% 72.471us 3.020us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.33% 17.879us 0.33% 17.879us 0.745us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.37% 20.117us 1.53% 82.751us 5.517us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.41% 76.295us 1.41% 76.295us 3.179us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 2.13% 114.795us 2.13% 114.795us 7.653us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.27% 14.801us 0.27% 14.801us 4.934us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.04% 2.110us 0.04% 2.110us 0.352us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.07% 3.990us 0.07% 3.990us 1.330us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.38% 3.149ms 58.38% 3.149ms 3.149ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.279ms -Self CUDA time total: 3.736ms +Self CPU time total: 5.395ms +Self CUDA time total: 3.823ms @@ -4303,29 +4303,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 4.47% 246.252us 42.66% 2.352ms 2.352ms 0.000us 0.00% 3.878ms 3.878ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 
0.000us 0.000us 3.831ms 100.28% 3.831ms 3.831ms 1 - aten::scaled_dot_product_attention 0.47% 26.180us 3.22% 177.714us 59.238us 0.000us 0.00% 3.035ms 1.012ms 3 - aten::_scaled_dot_product_flash_attention 0.34% 18.934us 2.75% 151.534us 50.511us 0.000us 0.00% 3.035ms 1.012ms 3 - aten::_flash_attention_forward 0.60% 33.169us 1.99% 109.931us 36.644us 3.035ms 79.45% 3.035ms 1.012ms 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 79.45% 3.035ms 1.012ms 3 - aten::contiguous 0.19% 10.269us 34.14% 1.882ms 156.829us 0.000us 0.00% 843.264us 70.272us 12 - aten::clone 0.51% 27.861us 33.95% 1.872ms 155.974us 0.000us 0.00% 843.264us 70.272us 12 - aten::copy_ 1.39% 76.612us 32.27% 1.779ms 148.225us 785.216us 20.55% 843.264us 70.272us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 785.216us 20.55% 785.216us 65.435us 12 - Activity Buffer Request 26.00% 1.433ms 26.00% 1.433ms 1.433ms 58.048us 1.52% 58.048us 58.048us 1 - aten::transpose 0.90% 49.620us 1.24% 68.282us 2.845us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.34% 18.662us 0.34% 18.662us 0.778us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.37% 20.139us 1.52% 83.911us 5.594us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.44% 79.524us 1.44% 79.524us 3.313us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 5.29% 291.664us 5.29% 291.664us 19.444us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.25% 13.850us 0.25% 13.850us 4.617us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.03% 1.810us 0.03% 1.810us 0.302us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.07% 3.620us 0.07% 3.620us 1.207us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 57.34% 3.161ms 57.34% 3.161ms 3.161ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 4.61% 261.106us 43.54% 2.469ms 2.469ms 0.000us 0.00% 3.945ms 3.945ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.898ms 100.28% 3.898ms 
3.898ms 1 + aten::scaled_dot_product_attention 0.46% 26.241us 3.40% 192.654us 64.218us 0.000us 0.00% 3.100ms 1.033ms 3 + aten::_scaled_dot_product_flash_attention 0.34% 19.509us 2.94% 166.413us 55.471us 0.000us 0.00% 3.100ms 1.033ms 3 + aten::_flash_attention_forward 0.74% 42.081us 2.16% 122.633us 40.878us 3.100ms 79.76% 3.100ms 1.033ms 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.100ms 79.76% 3.100ms 1.033ms 3 + aten::contiguous 0.20% 11.161us 34.71% 1.968ms 163.994us 0.000us 0.00% 844.704us 70.392us 12 + aten::clone 0.52% 29.682us 34.51% 1.957ms 163.064us 0.000us 0.00% 844.704us 70.392us 12 + aten::copy_ 1.45% 82.261us 32.81% 1.860ms 155.026us 786.784us 20.24% 844.704us 70.392us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 786.784us 20.24% 786.784us 65.565us 12 + Activity Buffer Request 26.26% 1.489ms 26.26% 1.489ms 1.489ms 57.920us 1.49% 57.920us 57.920us 1 + aten::transpose 0.95% 53.820us 1.26% 71.322us 2.972us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.31% 17.502us 0.31% 17.502us 0.729us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.39% 21.943us 1.53% 86.983us 5.799us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.40% 79.202us 1.40% 79.202us 3.300us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 5.55% 314.487us 5.55% 314.487us 20.966us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.26% 14.830us 0.26% 14.830us 4.943us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.04% 2.010us 0.04% 2.010us 0.335us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 56.46% 3.201ms 56.46% 3.201ms 3.201ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time 
total: 5.512ms -Self CUDA time total: 3.820ms +Self CPU time total: 5.670ms +Self CUDA time total: 3.887ms @@ -4335,29 +4335,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 4.69% 283.303us 42.14% 2.547ms 2.547ms 0.000us 0.00% 4.304ms 4.304ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.254ms 100.24% 4.254ms 4.254ms 1 - aten::scaled_dot_product_attention 0.82% 49.722us 3.53% 213.285us 71.095us 0.000us 0.00% 3.439ms 1.146ms 3 - aten::_scaled_dot_product_flash_attention 0.34% 20.582us 2.71% 163.563us 54.521us 0.000us 0.00% 3.439ms 1.146ms 3 - aten::_flash_attention_forward 0.62% 37.231us 1.93% 116.771us 38.924us 3.439ms 81.02% 3.439ms 1.146ms 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.439ms 81.02% 3.439ms 1.146ms 3 - aten::contiguous 0.18% 10.912us 32.97% 1.993ms 166.068us 0.000us 0.00% 865.695us 72.141us 12 - aten::clone 0.50% 30.059us 32.79% 1.982ms 165.158us 0.000us 0.00% 865.695us 72.141us 12 - aten::copy_ 1.39% 83.902us 31.17% 1.884ms 157.000us 805.439us 18.98% 865.695us 72.141us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 805.439us 18.98% 805.439us 67.120us 12 - Activity Buffer Request 24.08% 1.456ms 24.08% 1.456ms 1.456ms 60.256us 1.42% 60.256us 60.256us 1 - aten::transpose 1.06% 63.793us 1.39% 84.162us 3.507us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.34% 20.369us 0.34% 20.369us 0.849us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.36% 21.791us 1.46% 88.331us 5.889us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.33% 80.570us 1.33% 80.570us 3.357us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 6.09% 368.355us 6.09% 368.355us 24.557us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.25% 15.000us 0.25% 15.000us 5.000us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.03% 1.990us 0.03% 1.990us 0.332us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.07% 4.160us 0.07% 4.160us 1.387us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 57.86% 3.497ms 57.86% 3.497ms 3.497ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 5.12% 312.519us 40.82% 2.493ms 2.493ms 0.000us 0.00% 4.416ms 4.416ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.365ms 100.24% 4.365ms 4.365ms 1 + aten::scaled_dot_product_attention 0.42% 25.922us 3.20% 195.246us 65.082us 0.000us 0.00% 3.547ms 1.182ms 3 + aten::_scaled_dot_product_flash_attention 0.34% 20.847us 2.77% 169.324us 56.441us 0.000us 0.00% 3.547ms 1.182ms 3 + aten::_flash_attention_forward 0.72% 44.243us 2.07% 126.303us 42.101us 3.547ms 81.45% 3.547ms 1.182ms 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.547ms 81.45% 3.547ms 1.182ms 3 + aten::contiguous 0.17% 10.559us 31.73% 1.938ms 161.473us 0.000us 0.00% 869.122us 72.427us 12 + aten::clone 0.47% 28.763us 31.56% 1.927ms 160.593us 0.000us 0.00% 869.122us 72.427us 12 + aten::copy_ 1.36% 83.033us 30.01% 1.832ms 152.707us 807.906us 18.55% 869.122us 72.427us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 807.906us 18.55% 807.906us 67.326us 12 + Activity Buffer Request 24.51% 1.497ms 24.51% 1.497ms 1.497ms 61.216us 1.41% 61.216us 61.216us 1 + aten::transpose 0.85% 52.195us 1.14% 69.864us 2.911us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.29% 17.669us 0.29% 17.669us 0.736us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.34% 20.921us 1.44% 87.791us 5.853us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.30% 79.270us 1.30% 79.270us 3.303us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 4.55% 277.575us 4.55% 277.575us 18.505us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.27% 16.520us 0.27% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.03% 1.960us 0.03% 1.960us 0.327us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 59.18% 3.614ms 59.18% 3.614ms 3.614ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.045ms -Self CUDA time total: 4.244ms +Self CPU time total: 6.107ms +Self CUDA time total: 4.355ms @@ -4367,45 +4367,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_flash_ma 4.04% 248.485us 39.71% 2.440ms 2.440ms 0.000us 0.00% 4.431ms 4.431ms 1 - torch_flash_ma 0.00% 0.000us 0.00% 
0.000us 0.000us 4.380ms 100.24% 4.380ms 4.380ms 1 - aten::scaled_dot_product_attention 0.42% 25.679us 2.90% 178.082us 59.361us 0.000us 0.00% 3.552ms 1.184ms 3 - aten::_scaled_dot_product_flash_attention 0.29% 17.912us 2.48% 152.403us 50.801us 0.000us 0.00% 3.552ms 1.184ms 3 - aten::_flash_attention_forward 0.56% 34.360us 1.81% 111.452us 37.151us 3.552ms 81.28% 3.552ms 1.184ms 3 -void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.552ms 81.28% 3.552ms 1.184ms 3 - aten::contiguous 0.17% 10.359us 32.01% 1.967ms 163.915us 0.000us 0.00% 879.392us 73.283us 12 - aten::clone 0.45% 27.371us 31.84% 1.957ms 163.052us 0.000us 0.00% 879.392us 73.283us 12 - aten::copy_ 1.33% 81.681us 30.34% 1.864ms 155.367us 818.048us 18.72% 879.392us 73.283us 12 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 818.048us 18.72% 818.048us 68.171us 12 - Activity Buffer Request 23.48% 1.443ms 23.48% 1.443ms 1.443ms 61.344us 1.40% 61.344us 61.344us 1 - aten::transpose 0.84% 51.433us 1.14% 69.901us 2.913us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.30% 18.468us 0.30% 18.468us 0.769us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.32% 19.754us 1.37% 83.993us 5.600us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.26% 77.740us 1.26% 77.740us 3.239us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 5.92% 364.005us 5.92% 364.005us 24.267us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_strided 0.23% 14.381us 0.23% 14.381us 4.794us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceGetAttribute 0.03% 1.840us 0.03% 1.840us 0.307us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.07% 4.180us 0.07% 4.180us 1.393us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 60.29% 3.705ms 60.29% 3.705ms 3.705ms 0.000us 0.00% 0.000us 0.000us 1 + torch_flash_ma 3.85% 236.256us 38.02% 2.335ms 2.335ms 0.000us 0.00% 4.535ms 4.535ms 1 + torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.485ms 100.25% 4.485ms 
4.485ms 1 + aten::scaled_dot_product_attention 0.43% 26.452us 2.98% 183.275us 61.092us 0.000us 0.00% 3.655ms 1.218ms 3 + aten::_scaled_dot_product_flash_attention 0.30% 18.620us 2.55% 156.823us 52.274us 0.000us 0.00% 3.655ms 1.218ms 3 + aten::_flash_attention_forward 0.59% 36.060us 1.88% 115.323us 38.441us 3.655ms 81.69% 3.655ms 1.218ms 3 +void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 81.69% 3.655ms 1.218ms 3 + aten::contiguous 0.16% 9.770us 30.40% 1.867ms 155.567us 0.000us 0.00% 880.065us 73.339us 12 + aten::clone 0.46% 28.179us 30.24% 1.857ms 154.753us 0.000us 0.00% 880.065us 73.339us 12 + aten::copy_ 1.36% 83.563us 28.74% 1.765ms 147.054us 819.137us 18.31% 880.065us 73.339us 12 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 819.137us 18.31% 819.137us 68.261us 12 + Activity Buffer Request 23.24% 1.427ms 23.24% 1.427ms 1.427ms 60.928us 1.36% 60.928us 60.928us 1 + aten::transpose 0.86% 52.980us 1.16% 71.060us 2.961us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.29% 18.080us 0.29% 18.080us 0.753us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.34% 20.930us 1.37% 83.913us 5.594us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.25% 77.043us 1.25% 77.043us 3.210us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 4.54% 278.990us 4.54% 278.990us 18.599us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_strided 0.24% 14.661us 0.24% 14.661us 4.887us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceGetAttribute 0.03% 1.978us 0.03% 1.978us 0.330us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.06% 3.901us 0.06% 3.901us 1.300us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 61.98% 3.806ms 61.98% 3.806ms 3.806ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 
6.146ms -Self CUDA time total: 4.370ms +Self CPU time total: 6.141ms +Self CUDA time total: 4.474ms impl wl p50(ms) ok torch_flash_ma cuda_attn_L128_bfloat16 1.22 True -torch_flash_ma cuda_attn_L256_bfloat16 1.27 True -torch_flash_ma cuda_attn_L320_bfloat16 1.29 True -torch_flash_ma cuda_attn_L384_bfloat16 1.30 True -torch_flash_ma cuda_attn_L448_bfloat16 1.45 True -torch_flash_ma cuda_attn_L512_bfloat16 1.49 True +torch_flash_ma cuda_attn_L256_bfloat16 1.28 True +torch_flash_ma cuda_attn_L320_bfloat16 1.30 True +torch_flash_ma cuda_attn_L384_bfloat16 1.33 True +torch_flash_ma cuda_attn_L448_bfloat16 1.50 True +torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
-
-
▶ UV Install Logs
- -

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html index 0c6eeb07699e5badcea2a599fa3141678ce81b07..b43f3b2c4b9504821051f29d094124c270a7e0ee 100644 --- a/flash_attn/impls/hf_kernels_flash_attn.html +++ b/flash_attn/impls/hf_kernels_flash_attn.html @@ -4104,14 +4104,14 @@ body[data-tool="eraser"] .main-content { ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 10.91s +Cell: benchmark | 5.83s | Raw -GitHub -🤗 HF +GitHub +🤗 HF
@@ -4161,21 +4161,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 3.74% 162.312us 41.68% 1.808ms 1.808ms 0.000us 0.00% 3.686ms 3.686ms 1 - _flash_attn_9e27194::fwd 1.67% 72.360us 37.94% 1.646ms 548.560us 2.753ms 100.00% 3.686ms 1.229ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.05% 2.754ms 2.754ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.753ms 100.00% 2.753ms 917.639us 3 - Activity Buffer Request 33.08% 1.435ms 33.08% 1.435ms 1.435ms 933.501us 33.91% 933.501us 933.501us 1 - cudaDeviceGetAttribute 0.12% 5.209us 0.12% 5.209us 0.347us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.42% 18.210us 1.24% 53.790us 17.930us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.82% 35.580us 0.82% 35.580us 11.860us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.58% 25.153us 0.58% 25.153us 2.795us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.26% 11.441us 0.26% 11.441us 3.814us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.99% 42.781us 0.99% 42.781us 14.260us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.32% 2.530ms 58.32% 2.530ms 2.530ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 3.51% 153.413us 41.11% 1.797ms 1.797ms 0.000us 0.00% 3.733ms 3.733ms 1 + _flash_attn_9e27194::fwd 1.62% 70.702us 37.60% 1.644ms 547.894us 2.785ms 100.00% 3.733ms 1.244ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 
2.786ms 100.05% 2.786ms 2.786ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.785ms 100.00% 2.785ms 928.303us 3 + Activity Buffer Request 32.92% 1.439ms 32.92% 1.439ms 1.439ms 947.706us 34.03% 947.706us 947.706us 1 + cudaDeviceGetAttribute 0.11% 4.891us 0.11% 4.891us 0.326us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.37% 16.181us 1.17% 51.061us 17.020us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.80% 34.880us 0.80% 34.880us 11.627us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.59% 25.681us 0.59% 25.681us 2.853us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.26% 11.340us 0.26% 11.340us 3.780us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.93% 40.731us 0.93% 40.731us 13.577us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.89% 2.575ms 58.89% 2.575ms 2.575ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.338ms -Self CUDA time total: 2.753ms +Self CPU time total: 4.372ms +Self CUDA time total: 2.785ms @@ -4185,21 +4185,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.52% 113.464us 37.14% 1.670ms 1.670ms 0.000us 0.00% 3.984ms 3.984ms 1 - _flash_attn_9e27194::fwd 1.10% 49.632us 34.61% 1.557ms 518.855us 2.977ms 100.00% 3.984ms 1.328ms 3 - 
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.979ms 100.05% 2.979ms 2.979ms 1 -void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.977ms 100.00% 2.977ms 992.348us 3 - Activity Buffer Request 31.69% 1.425ms 31.69% 1.425ms 1.425ms 1.007ms 33.82% 1.007ms 1.007ms 1 - cudaDeviceGetAttribute 0.08% 3.769us 0.08% 3.769us 0.251us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.17% 7.560us 0.54% 24.080us 8.027us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.37% 16.520us 0.37% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.47% 21.170us 0.47% 21.170us 2.352us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.820us 0.08% 3.820us 1.273us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.64% 28.910us 0.64% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 62.86% 2.827ms 62.86% 2.827ms 2.827ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 1.94% 86.682us 37.50% 1.676ms 1.676ms 0.000us 0.00% 3.929ms 3.929ms 1 + _flash_attn_9e27194::fwd 1.06% 47.570us 35.56% 1.589ms 529.734us 2.938ms 100.00% 3.929ms 1.310ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 100.05% 2.939ms 2.939ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.938ms 100.00% 2.938ms 979.209us 3 + Activity Buffer Request 32.66% 1.460ms 32.66% 1.460ms 1.460ms 991.166us 33.74% 991.166us 991.166us 1 + cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.19% 8.440us 0.55% 24.690us 8.230us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.36% 16.250us 0.36% 16.250us 5.417us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.51% 22.872us 0.51% 22.872us 2.541us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.350us 0.07% 3.350us 1.117us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.60% 26.611us 0.60% 26.611us 8.870us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.50% 2.794ms 62.50% 2.794ms 2.794ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.497ms -Self CUDA time total: 2.977ms +Self CPU time total: 4.469ms +Self CUDA time total: 2.938ms @@ -4209,21 +4209,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.39% 108.133us 36.58% 1.655ms 1.655ms 0.000us 0.00% 4.040ms 4.040ms 1 - _flash_attn_9e27194::fwd 1.06% 48.029us 34.19% 1.547ms 515.608us 3.016ms 100.00% 4.040ms 1.347ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.017ms 100.05% 3.017ms 3.017ms 1 -void 
flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.00% 3.016ms 1.005ms 3 - Activity Buffer Request 31.28% 1.415ms 31.28% 1.415ms 1.415ms 1.024ms 33.96% 1.024ms 1.024ms 1 - cudaDeviceGetAttribute 0.09% 4.281us 0.09% 4.281us 0.285us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.16% 7.121us 0.52% 23.411us 7.804us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.36% 16.290us 0.36% 16.290us 5.430us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.49% 22.080us 0.49% 22.080us 2.453us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.840us 0.08% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.66% 29.710us 0.66% 29.710us 9.903us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 63.42% 2.870ms 63.42% 2.870ms 2.870ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.38% 109.313us 36.70% 1.683ms 1.683ms 0.000us 0.00% 4.081ms 4.081ms 1 + _flash_attn_9e27194::fwd 1.05% 48.167us 34.31% 1.574ms 524.567us 3.048ms 100.00% 4.081ms 1.360ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.049ms 100.05% 3.049ms 3.049ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.048ms 100.00% 3.048ms 1.016ms 3 + Activity Buffer Request 31.46% 1.443ms 31.46% 1.443ms 1.443ms 1.033ms 33.90% 1.033ms 1.033ms 1 + cudaDeviceGetAttribute 0.09% 4.231us 0.09% 4.231us 0.282us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.250us 0.52% 23.960us 7.987us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.36% 16.710us 0.36% 16.710us 5.570us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.46% 21.300us 0.46% 21.300us 2.367us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.561us 0.08% 3.561us 1.187us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.64% 29.473us 0.64% 29.473us 9.824us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.30% 2.903ms 63.30% 2.903ms 2.903ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.525ms -Self CUDA time total: 3.016ms +Self CPU time total: 4.586ms +Self CUDA time total: 3.048ms @@ -4233,21 +4233,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.32% 109.992us 39.04% 1.848ms 1.848ms 0.000us 0.00% 4.060ms 4.060ms 1 - _flash_attn_9e27194::fwd 1.05% 49.564us 36.71% 1.738ms 579.317us 3.035ms 100.00% 4.060ms 1.353ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.037ms 100.05% 3.037ms 3.037ms 1 -void 
flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 100.00% 3.035ms 1.012ms 3 - Activity Buffer Request 29.72% 1.407ms 29.72% 1.407ms 1.407ms 1.025ms 33.76% 1.025ms 1.025ms 1 - cudaDeviceGetAttribute 0.08% 3.690us 0.08% 3.690us 0.246us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.16% 7.770us 0.54% 25.380us 8.460us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.37% 17.610us 0.37% 17.610us 5.870us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.47% 22.139us 0.47% 22.139us 2.460us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 3.790us 0.08% 3.790us 1.263us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.78% 226.343us 4.78% 226.343us 75.448us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 60.96% 2.886ms 60.96% 2.886ms 2.886ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.13% 103.094us 38.83% 1.884ms 1.884ms 0.000us 0.00% 4.165ms 4.165ms 1 + _flash_attn_9e27194::fwd 0.99% 47.838us 36.71% 1.781ms 593.521us 3.114ms 100.00% 4.165ms 1.388ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.116ms 100.05% 3.116ms 3.116ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.114ms 100.00% 3.114ms 1.038ms 3 + Activity Buffer Request 29.59% 1.435ms 29.59% 1.435ms 1.435ms 1.051ms 33.75% 1.051ms 1.051ms 1 + cudaDeviceGetAttribute 0.08% 3.800us 0.08% 3.800us 0.253us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.891us 0.53% 25.811us 8.604us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.37% 17.920us 0.37% 17.920us 5.973us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.740us 0.08% 3.740us 1.247us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.99% 242.187us 4.99% 242.187us 80.729us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 61.17% 2.967ms 61.17% 2.967ms 2.967ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.734ms -Self CUDA time total: 3.035ms +Self CPU time total: 4.851ms +Self CUDA time total: 3.114ms @@ -4257,21 +4257,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.11% 110.542us 35.45% 1.860ms 1.860ms 0.000us 0.00% 4.719ms 4.719ms 1 - _flash_attn_9e27194::fwd 0.97% 51.080us 33.34% 1.750ms 583.220us 3.535ms 100.00% 4.719ms 1.573ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.537ms 100.04% 3.537ms 3.537ms 1 -void 
flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.535ms 100.00% 3.535ms 1.178ms 3 - Activity Buffer Request 27.95% 1.467ms 27.95% 1.467ms 1.467ms 1.184ms 33.49% 1.184ms 1.184ms 1 - cudaDeviceGetAttribute 0.07% 3.640us 0.07% 3.640us 0.243us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.14% 7.520us 0.47% 24.731us 8.244us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.33% 17.211us 0.33% 17.211us 5.737us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.43% 22.670us 0.43% 22.670us 2.519us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.07% 3.800us 0.07% 3.800us 1.267us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.37% 176.824us 3.37% 176.824us 58.941us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 64.55% 3.388ms 64.55% 3.388ms 3.388ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.00% 105.522us 34.61% 1.828ms 1.828ms 0.000us 0.00% 4.806ms 4.806ms 1 + _flash_attn_9e27194::fwd 0.94% 49.622us 32.62% 1.723ms 574.192us 3.597ms 100.00% 4.806ms 1.602ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.599ms 100.05% 3.599ms 3.599ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.597ms 100.00% 3.597ms 1.199ms 3 + Activity Buffer Request 27.37% 1.446ms 27.37% 1.446ms 1.446ms 1.209ms 33.59% 1.209ms 1.209ms 1 + cudaDeviceGetAttribute 0.08% 3.991us 0.08% 3.991us 0.266us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.14% 7.250us 0.47% 24.620us 8.207us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.33% 17.370us 0.33% 17.370us 5.790us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.41% 21.681us 0.41% 21.681us 2.409us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.28% 173.384us 3.28% 173.384us 57.795us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 65.39% 3.453ms 65.39% 3.453ms 3.453ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.248ms -Self CUDA time total: 3.535ms +Self CPU time total: 5.281ms +Self CUDA time total: 3.597ms @@ -4281,41 +4281,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn 2.24% 118.861us 34.58% 1.832ms 1.832ms 0.000us 0.00% 4.834ms 4.834ms 1 - _flash_attn_9e27194::fwd 0.90% 47.900us 32.34% 1.713ms 571.163us 3.618ms 100.00% 4.834ms 1.611ms 3 - hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.619ms 100.04% 3.619ms 3.619ms 1 -void 
flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.618ms 100.00% 3.618ms 1.206ms 3 - Activity Buffer Request 27.32% 1.448ms 27.32% 1.448ms 1.448ms 1.217ms 33.63% 1.217ms 1.217ms 1 - cudaDeviceGetAttribute 0.07% 3.661us 0.07% 3.661us 0.244us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty_like 0.14% 7.320us 0.50% 26.231us 8.744us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty_strided 0.36% 18.911us 0.36% 18.911us 6.304us 0.000us 0.00% 0.000us 0.000us 3 - aten::empty 0.40% 21.351us 0.40% 21.351us 2.372us 0.000us 0.00% 0.000us 0.000us 9 - cudaFuncSetAttribute 0.08% 4.160us 0.08% 4.160us 1.387us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.07% 162.463us 3.07% 162.463us 54.154us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 65.42% 3.466ms 65.42% 3.466ms 3.466ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn 2.02% 107.892us 33.82% 1.810ms 1.810ms 0.000us 0.00% 4.930ms 4.930ms 1 + _flash_attn_9e27194::fwd 0.91% 48.918us 31.80% 1.702ms 567.268us 3.687ms 100.00% 4.930ms 1.643ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.689ms 100.04% 3.689ms 3.689ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.687ms 100.00% 3.687ms 1.229ms 3 + Activity Buffer Request 26.86% 1.437ms 26.86% 1.437ms 1.437ms 1.242ms 33.69% 1.242ms 1.242ms 1 + cudaDeviceGetAttribute 0.07% 3.881us 0.07% 3.881us 0.259us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.14% 7.591us 0.49% 26.111us 8.704us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.35% 18.520us 0.35% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.39% 20.640us 0.39% 20.640us 2.293us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.561us 0.07% 3.561us 1.187us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.01% 161.306us 3.01% 161.306us 53.769us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 66.18% 3.542ms 66.18% 3.542ms 3.542ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.299ms -Self CUDA time total: 3.618ms +Self CPU time total: 5.351ms +Self CUDA time total: 3.687ms impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.94 True -hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True -hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.03 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True +hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True +hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
-
-
▶ UV Install Logs
- -
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s] -Fetching 20 files: 5%|▌ | 1/20 [00:00<00:02, 8.29it/s] -Fetching 20 files: 10%|█ | 2/20 [00:06<01:08, 3.82s/it] -Fetching 20 files: 100%|██████████| 20/20 [00:06<00:00, 3.06it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html index 16d419ea57e2fe2c3ccff8a3a3f19df88ec10363..a1db1794336426cb37d9956eacf119e09a093fa1 100644 --- a/flash_attn/impls/hf_kernels_flash_attn3.html +++ b/flash_attn/impls/hf_kernels_flash_attn3.html @@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: benchmark | 5.55s +Cell: benchmark | 5.53s | Raw -GitHub +GitHub 🤗 HF
@@ -4160,19 +4160,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 4.02% 170.054us 45.66% 1.931ms 1.931ms 0.000us 0.00% 3.489ms 3.489ms 1 - FlashAttnFunc 2.98% 126.112us 41.64% 1.761ms 586.890us 0.000us 0.00% 3.489ms 1.163ms 3 - _flash_attn3_48fe103_dirty::fwd 1.85% 78.440us 38.65% 1.635ms 544.853us 2.605ms 100.00% 3.489ms 1.163ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.606ms 100.06% 2.606ms 2.606ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.605ms 100.00% 2.605ms 868.221us 3 - Activity Buffer Request 34.45% 1.457ms 34.45% 1.457ms 1.457ms 884.680us 33.97% 884.680us 884.680us 1 - aten::empty 1.07% 45.402us 1.07% 45.402us 7.567us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.29% 12.202us 0.29% 12.202us 4.067us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.99% 41.761us 0.99% 41.761us 13.920us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 54.34% 2.298ms 54.34% 2.298ms 2.298ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 3.85% 171.193us 46.01% 2.045ms 2.045ms 0.000us 0.00% 3.614ms 3.614ms 1 + FlashAttnFunc 3.07% 136.295us 42.15% 1.874ms 624.570us 0.000us 0.00% 3.614ms 1.205ms 3 + _flash_attn3_48fe103_dirty::fwd 1.94% 86.341us 39.09% 1.737ms 579.138us 2.720ms 100.00% 3.614ms 1.205ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.722ms 100.05% 2.722ms 2.722ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.720ms 100.00% 2.720ms 906.698us 3 + Activity Buffer Request 34.72% 1.543ms 34.72% 1.543ms 1.543ms 893.600us 32.85% 893.600us 893.600us 1 + aten::empty 1.07% 47.441us 1.07% 47.441us 7.907us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.31% 13.761us 0.31% 13.761us 4.587us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.05% 46.772us 1.05% 46.772us 15.591us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 53.99% 2.400ms 53.99% 2.400ms 2.400ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.229ms -Self CUDA time total: 2.605ms +Self CPU time total: 4.445ms +Self CUDA time total: 2.720ms @@ -4182,19 +4182,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.90% 125.133us 41.34% 1.782ms 1.782ms 0.000us 0.00% 3.684ms 3.684ms 1 - FlashAttnFunc 2.10% 90.312us 38.43% 1.657ms 552.206us 0.000us 0.00% 3.684ms 1.228ms 3 - _flash_attn3_48fe103_dirty::fwd 1.24% 53.461us 36.34% 1.566ms 522.102us 2.755ms 100.00% 3.684ms 1.228ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.756ms 100.06% 2.756ms 2.756ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.00% 2.755ms 918.309us 3 - Activity Buffer Request 33.60% 1.448ms 33.60% 1.448ms 1.448ms 929.157us 33.73% 929.157us 929.157us 1 - aten::empty 0.64% 27.380us 0.64% 27.380us 4.563us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.13% 5.449us 0.13% 5.449us 1.816us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.74% 31.802us 0.74% 31.802us 10.601us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 58.66% 2.529ms 58.66% 2.529ms 2.529ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.41% 104.370us 41.13% 1.784ms 1.784ms 0.000us 0.00% 3.700ms 3.700ms 1 + FlashAttnFunc 2.00% 86.685us 38.73% 1.679ms 559.738us 0.000us 0.00% 3.700ms 1.233ms 3 + _flash_attn3_48fe103_dirty::fwd 1.21% 52.631us 36.73% 1.593ms 530.843us 2.768ms 100.00% 3.700ms 1.233ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.769ms 100.06% 2.769ms 2.769ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.768ms 100.00% 2.768ms 922.559us 3 + Activity Buffer Request 34.10% 1.479ms 34.10% 1.479ms 1.479ms 932.127us 33.68% 932.127us 932.127us 1 + aten::empty 0.60% 25.981us 0.60% 25.981us 4.330us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.050us 0.12% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.70% 30.140us 0.70% 30.140us 10.047us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.87% 2.553ms 58.87% 2.553ms 2.553ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.310ms -Self CUDA time total: 2.755ms +Self CPU time total: 4.336ms +Self CUDA time total: 2.768ms @@ -4204,19 +4204,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.81% 125.615us 39.44% 1.762ms 1.762ms 0.000us 0.00% 3.917ms 3.917ms 1 - FlashAttnFunc 2.03% 90.880us 36.63% 1.637ms 545.546us 0.000us 0.00% 3.917ms 1.306ms 3 - _flash_attn3_48fe103_dirty::fwd 1.20% 53.572us 34.59% 1.546ms 515.252us 2.927ms 100.00% 3.917ms 1.306ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.928ms 100.05% 2.928ms 2.928ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.927ms 100.00% 2.927ms 975.593us 3 - Activity Buffer Request 31.96% 1.428ms 31.96% 1.428ms 1.428ms 990.441us 33.84% 990.441us 990.441us 1 - aten::empty 0.63% 27.950us 0.63% 27.950us 4.658us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.340us 0.12% 5.340us 1.780us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.68% 30.562us 0.68% 30.562us 10.187us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 60.56% 2.706ms 60.56% 2.706ms 2.706ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.29% 102.411us 40.10% 1.791ms 1.791ms 0.000us 0.00% 3.875ms 3.875ms 1 + FlashAttnFunc 2.01% 89.903us 37.81% 1.688ms 562.801us 0.000us 0.00% 3.875ms 1.292ms 3 + _flash_attn3_48fe103_dirty::fwd 1.18% 52.613us 35.79% 1.599ms 532.834us 2.892ms 100.00% 3.875ms 1.292ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.893ms 100.05% 2.893ms 2.893ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.00% 2.892ms 963.972us 3 + Activity Buffer Request 33.24% 1.485ms 33.24% 1.485ms 1.485ms 983.097us 33.99% 983.097us 983.097us 1 + aten::empty 0.58% 25.770us 0.58% 25.770us 4.295us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 4.820us 0.11% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.69% 30.740us 0.69% 30.740us 10.247us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 59.90% 2.675ms 59.90% 2.675ms 2.675ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.469ms -Self CUDA time total: 2.927ms +Self CPU time total: 4.466ms +Self CUDA time total: 2.892ms @@ -4226,19 +4226,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.73% 126.513us 42.04% 1.948ms 1.948ms 0.000us 0.00% 3.892ms 3.892ms 1 - FlashAttnFunc 2.03% 94.184us 39.31% 1.821ms 607.134us 0.000us 0.00% 3.892ms 1.297ms 3 - _flash_attn3_48fe103_dirty::fwd 1.14% 52.959us 37.28% 1.727ms 575.740us 2.906ms 100.00% 3.892ms 1.297ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.908ms 100.05% 2.908ms 2.908ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.906ms 100.00% 2.906ms 968.728us 3 - Activity Buffer Request 30.69% 1.422ms 30.69% 1.422ms 1.422ms 985.540us 33.91% 985.540us 985.540us 1 - aten::empty 0.63% 29.361us 0.63% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.241us 0.11% 5.241us 1.747us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 4.70% 217.965us 4.70% 217.965us 72.655us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 57.96% 2.685ms 57.96% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.68% 125.944us 42.11% 1.982ms 1.982ms 0.000us 0.00% 3.932ms 3.932ms 1 + FlashAttnFunc 1.98% 92.983us 39.44% 1.856ms 618.639us 0.000us 0.00% 3.932ms 1.311ms 3 + _flash_attn3_48fe103_dirty::fwd 1.14% 53.661us 37.46% 1.763ms 587.645us 2.953ms 100.00% 3.932ms 1.311ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.954ms 100.06% 2.954ms 2.954ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.176us 3 + Activity Buffer Request 30.48% 1.434ms 30.48% 1.434ms 1.434ms 979.803us 33.19% 979.803us 979.803us 1 + aten::empty 0.58% 27.450us 0.58% 27.450us 4.575us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.150us 0.11% 5.150us 1.717us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.15% 242.396us 5.15% 242.396us 80.799us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 57.89% 2.724ms 57.89% 2.724ms 2.724ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.633ms -Self CUDA time total: 2.906ms +Self CPU time total: 4.706ms +Self CUDA time total: 2.953ms @@ -4248,19 +4248,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.33% 120.764us 37.09% 1.922ms 1.922ms 0.000us 0.00% 4.645ms 4.645ms 1 - FlashAttnFunc 1.78% 92.240us 34.76% 1.801ms 600.384us 0.000us 0.00% 4.645ms 1.548ms 3 - _flash_attn3_48fe103_dirty::fwd 1.04% 53.829us 32.98% 1.709ms 569.637us 3.482ms 100.00% 4.645ms 1.548ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.483ms 100.04% 3.483ms 3.483ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.482ms 100.00% 3.482ms 1.161ms 3 - Activity Buffer Request 27.80% 1.441ms 27.80% 1.441ms 1.441ms 1.163ms 33.40% 1.163ms 1.163ms 1 - aten::empty 0.54% 28.012us 0.54% 28.012us 4.669us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.10% 5.211us 0.10% 5.211us 1.737us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.50% 181.305us 3.50% 181.305us 60.435us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 62.91% 3.260ms 62.91% 3.260ms 3.260ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_flash_attn3 2.36% 122.892us 37.59% 1.960ms 1.960ms 0.000us 0.00% 4.622ms 4.622ms 1 + FlashAttnFunc 1.74% 90.533us 35.23% 1.837ms 612.429us 0.000us 0.00% 4.622ms 1.541ms 3 + _flash_attn3_48fe103_dirty::fwd 0.97% 50.750us 33.49% 1.747ms 582.252us 3.470ms 100.00% 4.622ms 1.541ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.472ms 100.05% 3.472ms 3.472ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.470ms 100.00% 3.470ms 1.157ms 3 + Activity Buffer Request 27.49% 1.433ms 27.49% 1.433ms 1.433ms 1.152ms 33.20% 1.152ms 1.152ms 1 + aten::empty 0.51% 26.592us 0.51% 26.592us 4.432us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.060us 0.10% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.43% 230.856us 4.43% 230.856us 76.952us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.41% 3.255ms 62.41% 3.255ms 3.255ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.182ms -Self CUDA time total: 3.482ms +Self CPU time total: 5.215ms +Self CUDA time total: 3.470ms @@ -4270,33 +4270,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_flash_attn3 2.54% 130.883us 37.28% 1.924ms 1.924ms 0.000us 0.00% 4.633ms 4.633ms 1 - FlashAttnFunc 1.80% 93.033us 34.74% 1.793ms 597.564us 0.000us 0.00% 4.633ms 1.544ms 3 - _flash_attn3_48fe103_dirty::fwd 1.02% 52.583us 32.94% 1.700ms 566.553us 3.468ms 100.00% 4.633ms 1.544ms 3 - hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.04% 3.469ms 3.469ms 1 + hf_kernels_flash_attn3 2.32% 120.892us 37.51% 1.951ms 1.951ms 0.000us 0.00% 4.639ms 4.639ms 1 + FlashAttnFunc 1.74% 90.773us 35.18% 1.830ms 610.133us 0.000us 0.00% 4.639ms 1.546ms 3 + _flash_attn3_48fe103_dirty::fwd 0.99% 51.351us 33.44% 1.740ms 579.875us 3.468ms 100.00% 4.639ms 1.546ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1 void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.468ms 100.00% 3.468ms 1.156ms 3 - Activity Buffer Request 27.99% 1.444ms 27.99% 1.444ms 1.444ms 1.165ms 33.61% 1.165ms 1.165ms 1 - aten::empty 0.56% 29.150us 0.56% 29.150us 4.858us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.10% 5.050us 0.10% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.27% 168.763us 3.27% 168.763us 56.254us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 62.72% 3.236ms 62.72% 3.236ms 3.236ms 0.000us 0.00% 0.000us 0.000us 1 + Activity Buffer Request 27.26% 1.418ms 27.26% 1.418ms 1.418ms 1.172ms 33.79% 1.172ms 1.172ms 1 + aten::empty 0.51% 26.560us 0.51% 26.560us 4.427us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.101us 0.10% 5.101us 1.700us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.58% 238.367us 4.58% 238.367us 79.456us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.49% 3.251ms 62.49% 3.251ms 3.251ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.160ms +Self CPU time total: 5.202ms Self CUDA time total: 3.468ms impl wl p50(ms) ok -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True -hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.95 True -hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True +hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True +hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.03 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.35it/s] -Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.71it/s] +Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.42it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.84it/s]

Artifacts:

diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html index a146d1ecfde534d0841c299486870e29ea70f3bb..e6d938b9f4ce572baa96778a2f0d11d329ead530 100644 --- a/flash_attn/impls/mem_efficient_attention.html +++ b/flash_attn/impls/mem_efficient_attention.html @@ -4110,7 +4110,7 @@ Cell: benchmark | 3.94s | Raw -GitHub +GitHub
@@ -4159,28 +4159,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 5.20% 361.468us 33.36% 2.319ms 2.319ms 0.000us 0.00% 5.387ms 5.387ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.370ms 100.63% 5.370ms 5.370ms 1 - aten::scaled_dot_product_attention 0.48% 33.240us 2.68% 186.333us 62.111us 0.000us 0.00% 4.719ms 1.573ms 3 - aten::_scaled_dot_product_efficient_attention 0.35% 24.389us 2.20% 153.093us 51.031us 0.000us 0.00% 4.719ms 1.573ms 3 - aten::_efficient_attention_forward 0.53% 37.120us 1.50% 104.111us 34.704us 4.719ms 88.44% 4.719ms 1.573ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.719ms 88.44% 4.719ms 1.573ms 3 - aten::contiguous 0.18% 12.841us 24.53% 1.706ms 189.522us 0.000us 0.00% 667.809us 74.201us 9 - aten::clone 0.46% 31.899us 24.35% 1.693ms 188.095us 0.000us 0.00% 667.809us 74.201us 9 - aten::copy_ 1.13% 78.352us 22.86% 1.589ms 176.604us 617.121us 11.56% 667.809us 74.201us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 617.121us 11.56% 617.121us 68.569us 9 - Activity Buffer Request 20.52% 1.427ms 20.52% 1.427ms 1.427ms 50.688us 0.95% 50.688us 50.688us 1 - aten::transpose 0.98% 68.237us 1.30% 90.074us 3.753us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.31% 21.837us 0.31% 21.837us 0.910us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.25% 17.541us 1.03% 71.521us 7.947us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 1.19% 82.429us 1.19% 82.429us 3.925us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.61% 111.770us 1.61% 111.770us 9.314us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.05% 3.512us 0.05% 3.512us 1.171us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.11% 7.660us 0.11% 7.660us 2.553us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 66.64% 4.633ms 66.64% 4.633ms 4.633ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 5.14% 365.276us 32.53% 2.313ms 2.313ms 0.000us 0.00% 5.511ms 5.511ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.492ms 100.58% 5.492ms 5.492ms 1 + aten::scaled_dot_product_attention 0.43% 30.401us 2.47% 175.534us 58.511us 0.000us 0.00% 4.841ms 1.614ms 3 + aten::_scaled_dot_product_efficient_attention 0.33% 23.489us 2.04% 145.133us 48.378us 0.000us 0.00% 4.841ms 1.614ms 3 + aten::_efficient_attention_forward 0.51% 36.572us 1.40% 99.733us 33.244us 4.841ms 88.65% 4.841ms 1.614ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.841ms 88.65% 4.841ms 1.614ms 3 + aten::contiguous 0.18% 12.851us 23.99% 1.706ms 189.523us 0.000us 0.00% 670.241us 74.471us 9 + aten::clone 0.46% 32.742us 23.80% 1.693ms 188.095us 0.000us 0.00% 670.241us 74.471us 9 + aten::copy_ 1.05% 74.801us 22.33% 1.588ms 176.415us 619.776us 11.35% 670.241us 74.471us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 619.776us 11.35% 619.776us 68.864us 9 + Activity Buffer Request 20.17% 1.434ms 20.17% 1.434ms 1.434ms 50.465us 0.92% 50.465us 50.465us 1 + aten::transpose 0.93% 66.224us 1.25% 88.644us 3.693us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.32% 22.420us 0.32% 22.420us 0.934us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.25% 17.919us 1.02% 72.382us 8.042us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 1.14% 81.114us 1.14% 81.114us 3.863us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.46% 103.973us 1.46% 103.973us 8.664us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.04% 2.960us 0.04% 2.960us 0.987us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.12% 8.310us 0.12% 8.310us 2.770us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 67.47% 4.798ms 67.47% 4.798ms 4.798ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 6.952ms -Self CUDA time total: 5.336ms +Self CPU time total: 7.111ms +Self CUDA time total: 5.460ms @@ -4190,28 +4190,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.61% 259.378us 29.44% 2.116ms 2.116ms 0.000us 0.00% 5.734ms 5.734ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.688ms 100.15% 5.688ms 5.688ms 1 - aten::scaled_dot_product_attention 0.27% 19.560us 
2.06% 147.832us 49.277us 0.000us 0.00% 5.042ms 1.681ms 3 - aten::_scaled_dot_product_efficient_attention 0.27% 19.340us 1.78% 128.272us 42.757us 0.000us 0.00% 5.042ms 1.681ms 3 - aten::_efficient_attention_forward 0.39% 28.380us 1.18% 84.990us 28.330us 5.042ms 88.79% 5.042ms 1.681ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.042ms 88.79% 5.042ms 1.681ms 3 - aten::contiguous 0.11% 8.118us 23.11% 1.661ms 184.525us 0.000us 0.00% 691.453us 76.828us 9 - aten::clone 0.32% 22.761us 23.00% 1.653ms 183.623us 0.000us 0.00% 691.453us 76.828us 9 - aten::copy_ 0.95% 68.519us 21.65% 1.556ms 172.887us 636.925us 11.21% 691.453us 76.828us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.925us 11.21% 636.925us 70.769us 9 - Activity Buffer Request 19.69% 1.415ms 19.69% 1.415ms 1.415ms 54.528us 0.96% 54.528us 54.528us 1 - aten::transpose 0.75% 54.034us 1.00% 71.792us 2.991us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.25% 17.758us 0.25% 17.758us 0.740us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.18% 12.992us 1.03% 73.863us 8.207us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 1.22% 87.512us 1.22% 87.512us 4.167us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.35% 96.951us 1.35% 96.951us 8.079us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.489us 0.03% 2.489us 0.830us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 70.56% 5.071ms 70.56% 5.071ms 5.071ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.28% 242.746us 28.00% 2.075ms 2.075ms 0.000us 0.00% 5.933ms 5.933ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.886ms 100.14% 5.886ms 5.886ms 1 + aten::scaled_dot_product_attention 0.25% 18.240us 1.89% 140.073us 46.691us 0.000us 0.00% 5.241ms 1.747ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 18.689us 1.64% 121.833us 
40.611us 0.000us 0.00% 5.241ms 1.747ms 3 + aten::_efficient_attention_forward 0.38% 28.462us 1.09% 81.063us 27.021us 5.241ms 89.17% 5.241ms 1.747ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.241ms 89.17% 5.241ms 1.747ms 3 + aten::contiguous 0.10% 7.041us 22.26% 1.650ms 183.285us 0.000us 0.00% 691.103us 76.789us 9 + aten::clone 0.29% 21.342us 22.17% 1.643ms 182.503us 0.000us 0.00% 691.103us 76.789us 9 + aten::copy_ 0.86% 63.451us 21.24% 1.574ms 174.872us 636.671us 10.83% 691.103us 76.789us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.671us 10.83% 636.671us 70.741us 9 + Activity Buffer Request 19.50% 1.445ms 19.50% 1.445ms 1.445ms 54.432us 0.93% 54.432us 54.432us 1 + aten::transpose 0.64% 47.650us 0.87% 64.701us 2.696us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.23% 17.051us 0.23% 17.051us 0.710us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.16% 11.589us 0.64% 47.330us 5.259us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.82% 60.521us 0.82% 60.521us 2.882us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.19% 88.044us 1.19% 88.044us 7.337us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.420us 0.03% 2.420us 0.807us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.030us 0.04% 3.030us 1.010us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 72.00% 5.335ms 72.00% 5.335ms 5.335ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.187ms -Self CUDA time total: 5.679ms +Self CPU time total: 7.410ms +Self CUDA time total: 5.878ms @@ -4221,28 +4221,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.31% 247.873us 28.16% 2.111ms 2.111ms 0.000us 0.00% 6.014ms 6.014ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.964ms 100.13% 5.964ms 5.964ms 1 - aten::scaled_dot_product_attention 0.26% 19.681us 1.94% 145.404us 48.468us 0.000us 0.00% 5.300ms 1.767ms 3 - aten::_scaled_dot_product_efficient_attention 0.25% 18.780us 1.68% 125.723us 41.908us 0.000us 0.00% 5.300ms 1.767ms 3 - aten::_efficient_attention_forward 0.40% 29.910us 1.12% 83.752us 27.917us 5.300ms 89.00% 5.300ms 1.767ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.300ms 89.00% 5.300ms 1.767ms 3 - aten::contiguous 0.10% 7.548us 22.32% 1.673ms 185.921us 0.000us 0.00% 713.444us 79.272us 9 - aten::clone 0.29% 21.851us 22.22% 1.666ms 185.082us 0.000us 0.00% 713.444us 79.272us 9 - aten::copy_ 0.89% 66.441us 21.22% 1.591ms 176.813us 655.331us 11.00% 713.444us 79.272us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 655.331us 11.00% 655.331us 72.815us 9 - Activity Buffer Request 19.37% 1.452ms 19.37% 1.452ms 1.452ms 58.113us 0.98% 58.113us 58.113us 1 - aten::transpose 0.68% 50.773us 0.90% 67.843us 2.827us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.23% 17.070us 0.23% 17.070us 0.711us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.16% 12.290us 0.70% 52.570us 5.841us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.87% 64.980us 0.87% 64.980us 3.094us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 1.28% 96.085us 1.28% 96.085us 8.007us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.520us 0.03% 2.520us 0.840us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.050us 0.04% 3.050us 1.017us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 71.84% 5.386ms 71.84% 5.386ms 5.386ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.21% 244.055us 27.47% 2.092ms 2.092ms 0.000us 0.00% 6.130ms 6.130ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.080ms 100.14% 6.080ms 6.080ms 1 + aten::scaled_dot_product_attention 0.23% 17.641us 1.86% 141.944us 47.315us 0.000us 0.00% 5.414ms 1.805ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 19.359us 1.63% 124.303us 41.434us 0.000us 0.00% 5.414ms 1.805ms 3 + aten::_efficient_attention_forward 0.37% 28.219us 1.06% 80.592us 26.864us 5.414ms 89.17% 5.414ms 1.805ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.414ms 89.17% 5.414ms 1.805ms 3 + aten::contiguous 0.11% 8.060us 21.81% 1.661ms 184.510us 0.000us 0.00% 716.192us 79.577us 9 + aten::clone 0.29% 22.431us 21.70% 1.653ms 183.615us 0.000us 0.00% 716.192us 79.577us 9 + aten::copy_ 0.81% 61.641us 20.75% 1.580ms 175.564us 657.728us 10.83% 716.192us 79.577us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 657.728us 10.83% 657.728us 73.081us 9 + Activity Buffer Request 19.08% 1.453ms 19.08% 1.453ms 1.453ms 58.464us 0.96% 58.464us 58.464us 1 + aten::transpose 0.69% 52.203us 0.92% 69.763us 2.907us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.23% 17.560us 0.23% 17.560us 0.732us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 11.581us 0.66% 50.023us 5.558us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.84% 63.785us 0.84% 63.785us 3.037us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 1.14% 86.832us 1.14% 86.832us 7.236us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.260us 0.04% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 72.53% 5.522ms 72.53% 5.522ms 5.522ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.498ms -Self CUDA time total: 5.956ms +Self CPU time total: 7.614ms +Self CUDA time total: 6.072ms @@ -4252,28 +4252,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.20% 247.803us 30.17% 2.338ms 2.338ms 0.000us 0.00% 6.050ms 6.050ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.000ms 100.13% 6.000ms 6.000ms 1 - aten::scaled_dot_product_attention 0.37% 28.670us 
2.04% 158.093us 52.698us 0.000us 0.00% 5.339ms 1.780ms 3 - aten::_scaled_dot_product_efficient_attention 0.26% 20.220us 1.67% 129.423us 43.141us 0.000us 0.00% 5.339ms 1.780ms 3 - aten::_efficient_attention_forward 0.38% 29.560us 1.08% 83.863us 27.954us 5.339ms 89.10% 5.339ms 1.780ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.339ms 89.10% 5.339ms 1.780ms 3 - aten::contiguous 0.10% 7.610us 24.36% 1.887ms 209.722us 0.000us 0.00% 711.328us 79.036us 9 - aten::clone 0.28% 21.914us 24.26% 1.880ms 208.876us 0.000us 0.00% 711.328us 79.036us 9 - aten::copy_ 0.87% 67.261us 23.30% 1.806ms 200.640us 653.248us 10.90% 711.328us 79.036us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.248us 10.90% 653.248us 72.583us 9 - Activity Buffer Request 18.39% 1.425ms 18.39% 1.425ms 1.425ms 58.080us 0.97% 58.080us 58.080us 1 - aten::transpose 0.68% 52.310us 0.90% 69.650us 2.902us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.22% 17.340us 0.22% 17.340us 0.723us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.16% 12.088us 0.67% 52.209us 5.801us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.84% 64.993us 0.84% 64.993us 3.095us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 4.36% 337.546us 4.36% 337.546us 28.129us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.491us 0.03% 2.491us 0.830us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.020us 0.04% 3.020us 1.007us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 69.83% 5.411ms 69.83% 5.411ms 5.411ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.16% 248.365us 29.29% 2.300ms 2.300ms 0.000us 0.00% 6.163ms 6.163ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.114ms 100.14% 6.114ms 6.114ms 1 + aten::scaled_dot_product_attention 0.24% 19.232us 1.82% 142.774us 47.591us 0.000us 0.00% 5.452ms 1.817ms 3 + aten::_scaled_dot_product_efficient_attention 0.25% 19.461us 1.57% 123.542us 
41.181us 0.000us 0.00% 5.452ms 1.817ms 3 + aten::_efficient_attention_forward 0.37% 29.029us 1.03% 80.672us 26.891us 5.452ms 89.29% 5.452ms 1.817ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.452ms 89.29% 5.452ms 1.817ms 3 + aten::contiguous 0.10% 7.931us 23.78% 1.867ms 207.435us 0.000us 0.00% 711.072us 79.008us 9 + aten::clone 0.30% 23.532us 23.68% 1.859ms 206.554us 0.000us 0.00% 711.072us 79.008us 9 + aten::copy_ 0.81% 63.779us 22.73% 1.785ms 198.306us 653.792us 10.71% 711.072us 79.008us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.792us 10.71% 653.792us 72.644us 9 + Activity Buffer Request 18.59% 1.459ms 18.59% 1.459ms 1.459ms 57.280us 0.94% 57.280us 57.280us 1 + aten::transpose 0.62% 48.610us 0.83% 65.130us 2.714us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.21% 16.520us 0.21% 16.520us 0.688us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.16% 12.281us 0.65% 50.702us 5.634us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.80% 62.502us 0.80% 62.502us 2.976us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.60% 282.729us 3.60% 282.729us 23.561us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.471us 0.03% 2.471us 0.824us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.05% 4.120us 0.05% 4.120us 1.373us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 70.71% 5.551ms 70.71% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.749ms -Self CUDA time total: 5.992ms +Self CPU time total: 7.851ms +Self CUDA time total: 6.106ms @@ -4283,28 +4283,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.22% 253.272us 29.03% 2.283ms 2.283ms 0.000us 0.00% 6.248ms 6.248ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.196ms 100.13% 6.196ms 6.196ms 1 - aten::scaled_dot_product_attention 0.25% 19.441us 2.25% 176.884us 58.961us 0.000us 0.00% 5.524ms 1.841ms 3 - aten::_scaled_dot_product_efficient_attention 0.26% 20.811us 2.00% 157.443us 52.481us 0.000us 0.00% 5.524ms 1.841ms 3 - aten::_efficient_attention_forward 0.41% 31.883us 1.42% 111.902us 37.301us 5.524ms 89.27% 5.524ms 1.841ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.524ms 89.27% 5.524ms 1.841ms 3 - aten::contiguous 0.10% 7.580us 22.97% 1.807ms 200.732us 0.000us 0.00% 724.035us 80.448us 9 - aten::clone 0.28% 22.150us 22.88% 1.799ms 199.890us 0.000us 0.00% 724.035us 80.448us 9 - aten::copy_ 0.85% 67.019us 21.94% 1.725ms 191.709us 664.226us 10.73% 724.035us 80.448us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 664.226us 10.73% 664.226us 73.803us 9 - Activity Buffer Request 18.12% 1.425ms 18.12% 1.425ms 1.425ms 59.809us 0.97% 59.809us 59.809us 1 - aten::transpose 0.68% 53.201us 0.91% 71.182us 2.966us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.23% 17.981us 0.23% 17.981us 0.749us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 12.001us 0.65% 51.482us 5.720us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.81% 63.729us 0.81% 63.729us 3.035us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 3.60% 283.426us 3.60% 283.426us 23.619us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.490us 0.03% 2.490us 0.830us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 2.980us 0.04% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 70.97% 5.581ms 70.97% 5.581ms 5.581ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 3.01% 243.675us 28.03% 2.272ms 2.272ms 0.000us 0.00% 6.451ms 6.451ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.399ms 100.13% 6.399ms 6.399ms 1 + aten::scaled_dot_product_attention 0.23% 18.671us 1.77% 143.224us 47.741us 0.000us 0.00% 5.726ms 1.909ms 3 + aten::_scaled_dot_product_efficient_attention 0.24% 19.652us 1.54% 124.553us 41.518us 0.000us 0.00% 5.726ms 1.909ms 3 + aten::_efficient_attention_forward 0.35% 28.317us 0.99% 80.642us 26.881us 5.726ms 89.60% 5.726ms 1.909ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.60% 5.726ms 1.909ms 3 + aten::contiguous 0.10% 7.791us 22.70% 1.840ms 204.460us 0.000us 0.00% 725.025us 80.558us 9 + aten::clone 0.29% 23.489us 22.61% 1.832ms 203.594us 0.000us 0.00% 725.025us 80.558us 9 + aten::copy_ 0.81% 65.293us 21.68% 1.757ms 195.223us 664.641us 10.40% 725.025us 80.558us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 664.641us 10.40% 664.641us 73.849us 9 + Activity Buffer Request 17.77% 1.440ms 17.77% 1.440ms 1.440ms 60.384us 0.94% 60.384us 60.384us 1 + aten::transpose 0.63% 51.151us 0.85% 69.251us 2.885us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.22% 18.100us 0.22% 18.100us 0.754us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 11.960us 0.64% 51.852us 5.761us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.79% 64.314us 0.79% 64.314us 3.063us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.36% 272.117us 3.36% 272.117us 22.676us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.500us 0.03% 2.500us 0.833us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.06% 4.532us 0.06% 4.532us 1.511us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 71.97% 5.833ms 71.97% 5.833ms 5.833ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 7.864ms -Self CUDA time total: 6.188ms +Self CPU time total: 8.105ms +Self CUDA time total: 6.391ms @@ -4314,37 +4314,37 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_mem_eff 3.10% 256.636us 27.41% 2.272ms 2.272ms 0.000us 0.00% 6.685ms 6.685ms 1 - torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.632ms 100.12% 6.632ms 6.632ms 1 - aten::scaled_dot_product_attention 0.23% 18.791us 
1.80% 149.483us 49.828us 0.000us 0.00% 5.954ms 1.985ms 3 - aten::_scaled_dot_product_efficient_attention 0.24% 19.642us 1.58% 130.692us 43.564us 0.000us 0.00% 5.954ms 1.985ms 3 - aten::_efficient_attention_forward 0.40% 33.027us 1.05% 86.901us 28.967us 5.954ms 89.88% 5.954ms 1.985ms 3 -fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.954ms 89.88% 5.954ms 1.985ms 3 - aten::contiguous 0.09% 7.531us 21.68% 1.797ms 199.660us 0.000us 0.00% 731.136us 81.237us 9 - aten::clone 0.27% 22.649us 21.59% 1.789ms 198.823us 0.000us 0.00% 731.136us 81.237us 9 - aten::copy_ 0.82% 67.700us 20.66% 1.712ms 190.261us 670.176us 10.12% 731.136us 81.237us 9 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 670.176us 10.12% 670.176us 74.464us 9 - Activity Buffer Request 17.30% 1.434ms 17.30% 1.434ms 1.434ms 60.960us 0.92% 60.960us 60.960us 1 - aten::transpose 0.90% 75.001us 1.12% 92.890us 3.870us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.22% 17.889us 0.22% 17.889us 0.745us 0.000us 0.00% 0.000us 0.000us 24 - aten::empty_like 0.15% 12.259us 0.66% 54.410us 6.046us 0.000us 0.00% 0.000us 0.000us 9 - aten::empty 0.81% 67.133us 0.81% 67.133us 3.197us 0.000us 0.00% 0.000us 0.000us 21 - cudaLaunchKernel 2.82% 234.057us 2.82% 234.057us 19.505us 0.000us 0.00% 0.000us 0.000us 12 - cudaStreamIsCapturing 0.03% 2.420us 0.03% 2.420us 0.807us 0.000us 0.00% 0.000us 0.000us 3 - cudaFuncSetAttribute 0.04% 3.430us 0.04% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3 - cudaDeviceSynchronize 72.59% 6.017ms 72.59% 6.017ms 6.017ms 0.000us 0.00% 0.000us 0.000us 1 + torch_mem_eff 2.88% 242.135us 27.00% 2.269ms 2.269ms 0.000us 0.00% 6.759ms 6.759ms 1 + torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.705ms 100.12% 6.705ms 6.705ms 1 + aten::scaled_dot_product_attention 0.21% 17.851us 1.72% 144.884us 48.295us 0.000us 0.00% 6.024ms 2.008ms 3 + aten::_scaled_dot_product_efficient_attention 0.23% 19.591us 1.51% 127.033us 
42.344us 0.000us 0.00% 6.024ms 2.008ms 3 + aten::_efficient_attention_forward 0.34% 28.520us 0.97% 81.532us 27.177us 6.024ms 89.96% 6.024ms 2.008ms 3 +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.024ms 89.96% 6.024ms 2.008ms 3 + aten::contiguous 0.10% 8.099us 21.87% 1.838ms 204.242us 0.000us 0.00% 734.178us 81.575us 9 + aten::clone 0.28% 23.122us 21.78% 1.830ms 203.342us 0.000us 0.00% 734.178us 81.575us 9 + aten::copy_ 0.74% 62.180us 20.86% 1.753ms 194.799us 672.322us 10.04% 734.178us 81.575us 9 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.322us 10.04% 672.322us 74.702us 9 + Activity Buffer Request 17.19% 1.445ms 17.19% 1.445ms 1.445ms 61.856us 0.92% 61.856us 61.856us 1 + aten::transpose 0.62% 52.351us 0.83% 70.022us 2.918us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.21% 17.671us 0.21% 17.671us 0.736us 0.000us 0.00% 0.000us 0.000us 24 + aten::empty_like 0.15% 12.653us 0.64% 53.763us 5.974us 0.000us 0.00% 0.000us 0.000us 9 + aten::empty 0.79% 66.761us 0.79% 66.761us 3.179us 0.000us 0.00% 0.000us 0.000us 21 + cudaLaunchKernel 3.19% 267.907us 3.19% 267.907us 22.326us 0.000us 0.00% 0.000us 0.000us 12 + cudaStreamIsCapturing 0.03% 2.430us 0.03% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3 + cudaFuncSetAttribute 0.04% 3.350us 0.04% 3.350us 1.117us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.00% 6.134ms 73.00% 6.134ms 6.134ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 8.289ms -Self CUDA time total: 6.624ms +Self CPU time total: 8.404ms +Self CUDA time total: 6.697ms impl wl p50(ms) ok -torch_mem_eff cuda_attn_L128_bfloat16 1.81 True -torch_mem_eff cuda_attn_L256_bfloat16 1.88 True -torch_mem_eff cuda_attn_L320_bfloat16 1.97 True -torch_mem_eff 
cuda_attn_L384_bfloat16 1.97 True -torch_mem_eff cuda_attn_L448_bfloat16 2.09 True -torch_mem_eff cuda_attn_L512_bfloat16 2.22 True +torch_mem_eff cuda_attn_L128_bfloat16 1.85 True +torch_mem_eff cuda_attn_L256_bfloat16 1.95 True +torch_mem_eff cuda_attn_L320_bfloat16 1.99 True +torch_mem_eff cuda_attn_L384_bfloat16 2.07 True +torch_mem_eff cuda_attn_L448_bfloat16 2.06 True +torch_mem_eff cuda_attn_L512_bfloat16 2.25 True

Artifacts:

diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html index fce8d8891e35f4da7c7b93129ab9c68bf413d0a6..c964f0f922939bcdffdf70f7e986e24de2938dac 100644 --- a/flash_attn/impls/sage_attention.html +++ b/flash_attn/impls/sage_attention.html @@ -4104,13 +4104,14 @@ body[data-tool="eraser"] .main-content { ▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 4.12s +Cell: benchmark | 4.69s | Raw -GitHub +GitHub +🤗 HF
@@ -4155,24 +4156,27 @@ Cell: benchmark | 4.12s
Running attention benchmark on cuda with 6 workloads.
 impl                     wl                  p50(ms)  ok
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
 
-
-Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 17.35it/s] -Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 15.18it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 21.06it/s] +
+
▶ UV Install Logs
+ +
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] +Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 11.73it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 16.12it/s]

Artifacts:

attention.jsonl diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html index e1ecdb582681c3ec96cb0b0c54cc3f176cd9f9eb..3e1c781413a91f403396426a1c99ea9ec7673187 100644 --- a/flash_attn/impls/xformers.html +++ b/flash_attn/impls/xformers.html @@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: benchmark | 5.04s +Cell: benchmark | 33.71s | Raw -GitHub +GitHub
@@ -4158,21 +4158,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 11.46% 506.438us 53.66% 2.372ms 2.372ms 0.000us 0.00% 3.500ms 3.500ms 1 - xformers_flash3::flash_fwd 4.48% 198.083us 41.44% 1.831ms 610.487us 0.000us 0.00% 3.500ms 1.167ms 3 - flash_attn_3::fwd 1.73% 76.649us 36.96% 1.633ms 544.459us 2.610ms 100.00% 3.500ms 1.167ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.612ms 100.06% 2.612ms 2.612ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.610ms 100.00% 2.610ms 870.154us 3 - Activity Buffer Request 33.26% 1.470ms 33.26% 1.470ms 1.470ms 889.248us 34.06% 889.248us 889.248us 1 - aten::empty 0.80% 35.182us 0.80% 35.182us 5.864us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.25% 10.920us 0.25% 10.920us 3.640us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.92% 40.501us 0.92% 40.501us 13.500us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.27% 12.132us 0.77% 33.872us 5.645us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.49% 21.740us 0.49% 21.740us 3.623us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 46.34% 2.048ms 46.34% 2.048ms 2.048ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 10.98% 488.134us 52.82% 2.349ms 2.349ms 0.000us 0.00% 3.539ms 3.539ms 1 + xformers_flash3::flash_fwd 4.45% 198.034us 41.02% 1.824ms 608.009us 0.000us 0.00% 3.539ms 1.180ms 3 + flash_attn_3::fwd 1.81% 80.354us 36.57% 1.626ms 541.997us 2.647ms 100.00% 3.539ms 1.180ms 3 + 
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.648ms 100.06% 2.648ms 2.648ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.647ms 100.00% 2.647ms 882.203us 3 + Activity Buffer Request 32.65% 1.452ms 32.65% 1.452ms 1.452ms 892.891us 33.74% 892.891us 892.891us 1 + aten::empty 0.78% 34.470us 0.78% 34.470us 5.745us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.26% 11.370us 0.26% 11.370us 3.790us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.08% 47.851us 1.08% 47.851us 15.950us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.28% 12.261us 0.82% 36.420us 6.070us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.54% 24.159us 0.54% 24.159us 4.026us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 47.18% 2.098ms 47.18% 2.098ms 2.098ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.420ms -Self CUDA time total: 2.610ms +Self CPU time total: 4.447ms +Self CUDA time total: 2.647ms @@ -4182,21 +4182,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 7.25% 318.297us 46.47% 2.042ms 2.042ms 0.000us 0.00% 3.722ms 3.722ms 1 - xformers_flash3::flash_fwd 3.37% 148.131us 38.68% 1.699ms 566.453us 0.000us 0.00% 3.722ms 1.241ms 3 - flash_attn_3::fwd 1.17% 51.450us 35.31% 1.551ms 517.076us 2.780ms 100.00% 3.722ms 
1.241ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.782ms 100.05% 2.782ms 2.782ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.780ms 100.00% 2.780ms 926.692us 3 - Activity Buffer Request 32.58% 1.431ms 32.58% 1.431ms 1.431ms 942.244us 33.89% 942.244us 942.244us 1 - aten::empty 0.66% 29.210us 0.66% 29.210us 4.868us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.13% 5.512us 0.13% 5.512us 1.837us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.77% 34.031us 0.77% 34.031us 11.344us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.21% 9.369us 0.54% 23.900us 3.983us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.33% 14.531us 0.33% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 53.53% 2.351ms 53.53% 2.351ms 2.351ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 7.22% 318.208us 46.97% 2.070ms 2.070ms 0.000us 0.00% 3.700ms 3.700ms 1 + xformers_flash3::flash_fwd 3.33% 146.973us 39.20% 1.728ms 575.898us 0.000us 0.00% 3.700ms 1.233ms 3 + flash_attn_3::fwd 1.20% 53.004us 35.87% 1.581ms 526.907us 2.767ms 100.00% 3.700ms 1.233ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.769ms 100.05% 2.769ms 2.769ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.767ms 100.00% 2.767ms 922.499us 3 + Activity Buffer Request 33.12% 1.459ms 33.12% 1.459ms 1.459ms 932.857us 33.71% 932.857us 932.857us 1 + aten::empty 0.65% 28.790us 0.65% 28.790us 4.798us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.13% 5.860us 0.13% 5.860us 1.953us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.76% 33.580us 0.76% 33.580us 11.193us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.21% 9.291us 0.54% 23.901us 3.983us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.33% 14.610us 0.33% 14.610us 2.435us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 53.03% 2.337ms 53.03% 2.337ms 2.337ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.393ms -Self CUDA time total: 2.780ms +Self CPU time total: 4.407ms +Self CUDA time total: 2.767ms @@ -4206,21 +4206,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.91% 309.504us 45.24% 2.025ms 2.025ms 0.000us 0.00% 3.854ms 3.854ms 1 - xformers_flash3::flash_fwd 3.30% 147.756us 37.80% 1.692ms 563.990us 0.000us 0.00% 3.854ms 1.285ms 3 - flash_attn_3::fwd 1.19% 53.048us 34.50% 1.544ms 514.738us 2.875ms 100.00% 3.854ms 1.285ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.877ms 100.05% 2.877ms 2.877ms 1 -void 
cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.875ms 100.00% 2.875ms 958.381us 3 - Activity Buffer Request 31.77% 1.422ms 31.77% 1.422ms 1.422ms 979.266us 34.06% 979.266us 979.266us 1 - aten::empty 0.67% 29.790us 0.67% 29.790us 4.965us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 5.570us 0.12% 5.570us 1.857us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 0.76% 33.852us 0.76% 33.852us 11.284us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.22% 9.920us 0.53% 23.660us 3.943us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 54.76% 2.451ms 54.76% 2.451ms 2.451ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.87% 306.279us 45.67% 2.036ms 2.036ms 0.000us 0.00% 3.803ms 3.803ms 1 + xformers_flash3::flash_fwd 3.28% 146.193us 38.29% 1.707ms 568.871us 0.000us 0.00% 3.803ms 1.268ms 3 + flash_attn_3::fwd 1.22% 54.360us 35.01% 1.560ms 520.140us 2.841ms 100.00% 3.803ms 1.268ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.843ms 100.05% 2.843ms 2.843ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.841ms 100.00% 2.841ms 947.064us 3 + Activity Buffer Request 32.21% 1.435ms 32.21% 1.435ms 1.435ms 961.848us 33.85% 961.848us 961.848us 1 + aten::empty 0.68% 30.200us 0.68% 30.200us 5.033us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.560us 0.12% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.78% 34.863us 0.78% 34.863us 11.621us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.20% 8.808us 0.51% 22.610us 3.768us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.31% 13.802us 0.31% 13.802us 2.300us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 54.33% 2.422ms 54.33% 2.422ms 2.422ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.476ms -Self CUDA time total: 2.875ms +Self CPU time total: 4.457ms +Self CUDA time total: 2.841ms @@ -4230,21 +4230,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.53% 306.895us 47.96% 2.255ms 2.255ms 0.000us 0.00% 3.838ms 3.838ms 1 - xformers_flash3::flash_fwd 3.09% 145.243us 40.94% 1.925ms 641.651us 0.000us 0.00% 3.838ms 1.279ms 3 - flash_attn_3::fwd 1.17% 55.062us 37.85% 1.780ms 593.237us 2.865ms 100.00% 3.838ms 1.279ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.866ms 100.05% 2.866ms 2.866ms 1 -void 
cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.865ms 100.00% 2.865ms 954.931us 3 - Activity Buffer Request 30.23% 1.421ms 30.23% 1.421ms 1.421ms 973.182us 33.97% 973.182us 973.182us 1 - aten::empty 0.63% 29.790us 0.63% 29.790us 4.965us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.390us 0.11% 5.390us 1.797us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 5.70% 268.094us 5.70% 268.094us 89.365us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.19% 8.710us 0.49% 22.930us 3.822us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.30% 14.220us 0.30% 14.220us 2.370us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 52.04% 2.447ms 52.04% 2.447ms 2.447ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 6.67% 311.798us 48.16% 2.253ms 2.253ms 0.000us 0.00% 3.854ms 3.854ms 1 + xformers_flash3::flash_fwd 3.68% 172.144us 40.98% 1.917ms 638.949us 0.000us 0.00% 3.854ms 1.285ms 3 + flash_attn_3::fwd 1.19% 55.670us 37.30% 1.745ms 581.568us 2.881ms 100.00% 3.854ms 1.285ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.05% 2.883ms 2.883ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.881ms 100.00% 2.881ms 960.465us 3 + Activity Buffer Request 30.77% 1.440ms 30.77% 1.440ms 1.440ms 972.603us 33.75% 972.603us 972.603us 1 + aten::empty 0.63% 29.580us 0.63% 29.580us 4.930us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.801us 0.12% 5.801us 1.934us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.58% 214.036us 4.58% 214.036us 71.345us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.19% 9.019us 0.51% 24.051us 4.009us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.32% 15.032us 0.32% 15.032us 2.505us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 51.84% 2.425ms 51.84% 2.425ms 2.425ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 4.702ms -Self CUDA time total: 2.865ms +Self CPU time total: 4.678ms +Self CUDA time total: 2.881ms @@ -4254,21 +4254,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.46% 328.735us 43.31% 2.206ms 2.206ms 0.000us 0.00% 4.477ms 4.477ms 1 - xformers_flash3::flash_fwd 3.06% 155.642us 36.36% 1.852ms 617.231us 0.000us 0.00% 4.477ms 1.492ms 3 - flash_attn_3::fwd 1.12% 56.881us 33.30% 1.696ms 565.350us 3.348ms 100.00% 4.477ms 1.492ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.349ms 100.04% 3.349ms 3.349ms 1 -void 
cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.348ms 100.00% 3.348ms 1.116ms 3 - Activity Buffer Request 27.91% 1.421ms 27.91% 1.421ms 1.421ms 1.129ms 33.72% 1.129ms 1.129ms 1 - aten::empty 0.63% 32.251us 0.63% 32.251us 5.375us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.11% 5.740us 0.11% 5.740us 1.913us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.53% 179.913us 3.53% 179.913us 59.971us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.21% 10.692us 0.50% 25.231us 4.205us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.29% 14.539us 0.29% 14.539us 2.423us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 56.69% 2.887ms 56.69% 2.887ms 2.887ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.88% 304.576us 42.22% 2.188ms 2.188ms 0.000us 0.00% 4.552ms 4.552ms 1 + xformers_flash3::flash_fwd 2.84% 147.154us 35.91% 1.861ms 620.213us 0.000us 0.00% 4.552ms 1.517ms 3 + flash_attn_3::fwd 1.02% 52.961us 33.07% 1.713ms 571.161us 3.412ms 100.00% 4.552ms 1.517ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.414ms 100.04% 3.414ms 3.414ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.412ms 100.00% 3.412ms 1.137ms 3 + Activity Buffer Request 27.95% 1.448ms 27.95% 1.448ms 1.448ms 1.140ms 33.41% 1.140ms 1.140ms 1 + aten::empty 0.56% 29.272us 0.56% 29.272us 4.879us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 6.180us 0.12% 6.180us 2.060us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.41% 176.624us 3.41% 176.624us 58.875us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.17% 9.052us 0.44% 22.882us 3.814us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.27% 13.830us 0.27% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 57.78% 2.994ms 57.78% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.092ms -Self CUDA time total: 3.348ms +Self CPU time total: 5.182ms +Self CUDA time total: 3.412ms @@ -4278,37 +4278,83 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - xformers_meff 6.24% 320.533us 43.45% 2.233ms 2.233ms 0.000us 0.00% 4.496ms 4.496ms 1 - xformers_flash3::flash_fwd 2.90% 149.124us 36.73% 1.887ms 629.094us 0.000us 0.00% 4.496ms 1.499ms 3 - flash_attn_3::fwd 1.48% 76.290us 33.83% 1.738ms 579.386us 3.368ms 100.00% 4.496ms 1.499ms 3 - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.369ms 100.05% 3.369ms 3.369ms 1 -void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.368ms 100.00% 3.368ms 1.123ms 3 - Activity Buffer Request 28.33% 1.456ms 28.33% 1.456ms 1.456ms 1.129ms 33.51% 1.129ms 1.129ms 1 - aten::empty 0.58% 29.962us 0.58% 29.962us 4.994us 0.000us 0.00% 0.000us 0.000us 6 - cudaFuncSetAttribute 0.12% 6.240us 0.12% 6.240us 2.080us 0.000us 0.00% 0.000us 0.000us 3 - cudaLaunchKernel 3.31% 169.832us 3.31% 169.832us 56.611us 0.000us 0.00% 0.000us 0.000us 3 - aten::reshape 0.21% 10.672us 0.48% 24.873us 4.146us 0.000us 0.00% 0.000us 0.000us 6 - aten::view 0.28% 14.201us 0.28% 14.201us 2.367us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 56.55% 2.906ms 56.55% 2.906ms 2.906ms 0.000us 0.00% 0.000us 0.000us 1 + xformers_meff 5.58% 285.697us 41.87% 2.143ms 2.143ms 0.000us 0.00% 4.544ms 4.544ms 1 + xformers_flash3::flash_fwd 2.91% 148.714us 35.83% 1.834ms 611.255us 0.000us 0.00% 4.544ms 1.515ms 3 + flash_attn_3::fwd 1.04% 53.311us 32.92% 1.685ms 561.684us 3.402ms 100.00% 4.544ms 1.515ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.403ms 100.05% 3.403ms 3.403ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.402ms 100.00% 3.402ms 1.134ms 3 + Activity Buffer Request 27.78% 1.422ms 27.78% 1.422ms 1.422ms 1.142ms 33.57% 1.142ms 1.142ms 1 + aten::empty 0.58% 29.640us 0.58% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.990us 0.12% 5.990us 1.997us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.40% 174.134us 3.40% 174.134us 58.045us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.17% 8.543us 0.45% 23.191us 3.865us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.29% 14.648us 0.29% 14.648us 2.441us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 58.13% 2.975ms 58.13% 2.975ms 2.975ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 5.138ms -Self CUDA time total: 3.368ms +Self CPU time total: 5.118ms +Self CUDA time total: 3.402ms impl wl p50(ms) ok -xformers_meff cuda_attn_L128_bfloat16 0.98 True -xformers_meff cuda_attn_L256_bfloat16 1.02 True -xformers_meff cuda_attn_L320_bfloat16 1.07 True +xformers_meff cuda_attn_L128_bfloat16 1.00 True +xformers_meff cuda_attn_L256_bfloat16 1.03 True +xformers_meff cuda_attn_L320_bfloat16 1.08 True xformers_meff cuda_attn_L384_bfloat16 1.08 True -xformers_meff cuda_attn_L448_bfloat16 1.24 True +xformers_meff cuda_attn_L448_bfloat16 1.25 True xformers_meff cuda_attn_L512_bfloat16 1.23 True
▶ UV Install Logs
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg index 0f51d77bf35af08b6174bc4df17db6fe30a4e491..31d30c5dcfa68f4fc35593a1422ddd982b5374d8 100644 --- a/flash_attn/results/artifacts/combine/latency.svg +++ b/flash_attn/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6390d15c17c1cced5612c62eb1fb07f7304765d3d9c2c842f634fd3107bbeaf -size 24786 +oid sha256:520b28a43c879f6952cf0ddeade1438dbb5bd7caf01b6509254a4c68e9446ee6 +size 24783 diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html index dbe50dede3b447c779732c2f39dd59bfd2928e4f..0682107b1540718d4e870417450dee78797760de 100644 --- a/flash_attn/results/combined_results.html +++ b/flash_attn/results/combined_results.html @@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content { - 2025-10-30T15:53:53.940454 + 2025-10-31T20:14:18.946177 image/svg+xml @@ -4217,96 +4217,96 @@ body[data-tool="eraser"] .main-content { - + - + - 1.0 + 1.0 - + - + - 1.2 + 1.2 - + - + - 1.4 + 1.4 - + - + - 1.6 + 1.6 - + - + - 1.8 + 1.8 - + - + - 2.0 + 2.0 - + - + - 2.2 + 2.2 @@ -4314,73 +4314,73 @@ body[data-tool="eraser"] .main-content { - + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + - + - - - - - + + + + + @@ -4465,7 +4465,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.26s +Cell: combine | 4.31s | Raw @@ -4572,47 +4572,47 @@ Summary: 6 found, 0 skipped, 0 missing COMBINED BENCHMARK SUMMARY impl wl p50(ms) ok -hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.94 True -hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True -hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.03 True -hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True -hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True +hf_kernels_flash_attn 
cuda_attn_L256_bfloat16 1.00 True +hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True -hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True -hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.95 True -hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True -hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True -hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True +hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True +hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.03 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False - Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd' + Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False - Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd' + Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False - Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd' + Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False - Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd' + Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False - Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd' + Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd' sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False - Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 
'fwd' + Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd' torch_flash_ma cuda_attn_L128_bfloat16 1.22 True -torch_flash_ma cuda_attn_L256_bfloat16 1.27 True -torch_flash_ma cuda_attn_L320_bfloat16 1.29 True -torch_flash_ma cuda_attn_L384_bfloat16 1.30 True -torch_flash_ma cuda_attn_L448_bfloat16 1.45 True -torch_flash_ma cuda_attn_L512_bfloat16 1.49 True -torch_mem_eff cuda_attn_L128_bfloat16 1.81 True -torch_mem_eff cuda_attn_L256_bfloat16 1.88 True -torch_mem_eff cuda_attn_L320_bfloat16 1.97 True -torch_mem_eff cuda_attn_L384_bfloat16 1.97 True -torch_mem_eff cuda_attn_L448_bfloat16 2.09 True -torch_mem_eff cuda_attn_L512_bfloat16 2.22 True -xformers_meff cuda_attn_L128_bfloat16 0.98 True -xformers_meff cuda_attn_L256_bfloat16 1.02 True -xformers_meff cuda_attn_L320_bfloat16 1.07 True +torch_flash_ma cuda_attn_L256_bfloat16 1.28 True +torch_flash_ma cuda_attn_L320_bfloat16 1.30 True +torch_flash_ma cuda_attn_L384_bfloat16 1.33 True +torch_flash_ma cuda_attn_L448_bfloat16 1.50 True +torch_flash_ma cuda_attn_L512_bfloat16 1.51 True +torch_mem_eff cuda_attn_L128_bfloat16 1.85 True +torch_mem_eff cuda_attn_L256_bfloat16 1.95 True +torch_mem_eff cuda_attn_L320_bfloat16 1.99 True +torch_mem_eff cuda_attn_L384_bfloat16 2.07 True +torch_mem_eff cuda_attn_L448_bfloat16 2.06 True +torch_mem_eff cuda_attn_L512_bfloat16 2.25 True +xformers_meff cuda_attn_L128_bfloat16 1.00 True +xformers_meff cuda_attn_L256_bfloat16 1.03 True +xformers_meff cuda_attn_L320_bfloat16 1.08 True xformers_meff cuda_attn_L384_bfloat16 1.08 True -xformers_meff cuda_attn_L448_bfloat16 1.24 True +xformers_meff cuda_attn_L448_bfloat16 1.25 True xformers_meff cuda_attn_L512_bfloat16 1.23 True GENERATING COMBINED VISUALIZATION @@ -4637,7 +4637,7 @@ Implementations included:
▶ UV Install Logs
@@ -4650,7 +4650,7 @@ Installed 37 packages in 190ms - 2025-10-30T15:53:53.940454 + 2025-10-31T20:14:18.946177 image/svg+xml @@ -4760,96 +4760,96 @@ Installed 37 packages in 190ms - + - + - 1.0 + 1.0 - + - + - 1.2 + 1.2 - + - + - 1.4 + 1.4 - + - + - 1.6 + 1.6 - + - + - 1.8 + 1.8 - + - + - 2.0 + 2.0 - + - + - 2.2 + 2.2 @@ -4857,73 +4857,73 @@ Installed 37 packages in 190ms - + - - - - - - + + + + + + - + - - - - - + + + + + - + - - - - - - + + + + + + - + - - - - - - + + + + + + - + - - - - - + + + + + diff --git a/index.html b/index.html index 6d43c0f1f0a8b42c583cb3e0d6a059916ac01ccc..11cdf1eef85f4dda68d9e978af612e8aae0078bb 100644 --- a/index.html +++ b/index.html @@ -4097,35 +4097,54 @@ body[data-tool="eraser"] .main-content {
-

KERNELS COMMUNITY BENCHMARKS

+ + +


+

KERNELS COMMUNITY BENCHMARKS

This report aggregates latency and performance benchmarks across core model components.
Each section includes:
- A latency visualization
- Links to detailed implementation benchmarks

TABLE OF CONTENTS

+

RUN YOURSELF

+

To run the benchmarks locally, clone the repository and use uvx to build and run the benchmarks:

+

Note: the benchmarks are designed to run on a machine with a compatible NVIDIA GPU and CUDA installed; other hardware may not work as expected.

+
git clone https://github.com/huggingface/kernels-benchmarks.git
+cd kernels-benchmarks
+uvx https://github.com/drbh/uvnote.git build benches
+
+

METHODOLOGY

-

Each benchmark is run with the Kernels Benchmarking Framework and follows these principles:
+

Each benchmark is run with the +Kernels Benchmarking Framework and follows these principles:
- a reference implementation (usually PyTorch native) is included for baseline comparison
- multiple input sizes and batch sizes are tested to reflect real-world usage
- runs are repeatable via python virtual environments and documented dependencies
- results are collected and visualized using standardized scripts

-
+


+

BENCHMARKS

Note: Latency values are measured in milliseconds (ms). Lower values indicate better performance.
-

LAYER NORMALIZATION

+

ACTIVATION FUNCTIONS

- Layer Norm Latency + Activation Latency
@@ -4133,32 +4152,40 @@ Each section includes:
+ + + - - + + + + + - - + + + + +
Implementation DescriptionSourceHFBench
HF Kernels Layer NormHuggingFace kernels implementationHF Kernels SwiGLUHuggingFace kernels SwiGLU implementationGitHubHFBench
PyTorch Layer NormPyTorch native implementationPyTorch SwiGLUPyTorch native SwiGLU implementation--Bench

- -


-

ROTARY POSITION EMBEDDINGS

+

FLASH ATTENTION

- Rotary Position Embeddings Latency + Flash Attention Latency
@@ -4166,31 +4193,68 @@ Each section includes:
+ + + - - + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Implementation DescriptionSourceHFBench
HF Kernels RotaryHuggingFace kernels implementationFlash AttentionTorch SDPA Flash Attention implementation--Bench
PyTorch RotaryPyTorch native implementationHF Kernels Flash Attention 2HuggingFace kernels Flash AttentionGitHubHFBench
HF Kernels Flash Attention 3HuggingFace kernels Flash Attention 3GitHubHFBench
Memory Efficient AttentionMemory efficient attention implementation-Bench
Sage AttentionSage attention implementationHFBench
xFormersxFormers attention implementationGitHub-Bench


-

FLASH ATTENTION

+

DEFORMABLE DETR

- Flash Attention Latency + Deformable DETR Latency
@@ -4198,38 +4262,72 @@ Each section includes:
+ + + - - - - - - + + + + + - - + + + + + + +
Implementation DescriptionSourceHFBench
Flash AttentionFlash Attention implementation
HF Kernels Flash AttentionHuggingFace kernels Flash AttentionHF Kernels Deformable DETRHuggingFace kernels Deformable DETR implementationGitHubHFBench
HF Kernels Flash Attention 3HuggingFace kernels Flash Attention 3PyTorch Deformable DETRPyTorch native Deformable DETR implementation--Bench
+

+ +

+ +
+

OPENAI-STYLE MOE

+
+ OpenAI MoE Latency +
+ + + - - + + + + + + + - - + + + + + - - + + + + +
Memory Efficient AttentionMemory efficient attention implementationImplementationDescriptionSourceHFBench
Sage AttentionSage attention implementationGptOssExpertsGPT OSS reference OpenAI-style MoEBench
xFormersxFormers attention implementationBinned PyTorchBinned PyTorch OpenAI-style MoE implementation--Bench

@@ -4246,16 +4344,25 @@ Each section includes:
Implementation Description +Source +HF +Bench HF Kernels Causal Conv1D HuggingFace kernels implementation +GitHub +HF +Bench PyTorch Causal Conv1D PyTorch native implementation +- +- +Bench @@ -4268,9 +4375,9 @@ Each section includes:


-

ACTIVATION FUNCTIONS

+

ROTARY POSITION EMBEDDINGS

- Activation Latency + Rotary Position Embeddings Latency
@@ -4278,28 +4385,77 @@ Each section includes:
+ + + - - + + + + + - - + + + + +
Implementation DescriptionSourceHFBench
HF Kernels SwiGLUHuggingFace kernels SwiGLU implementationHF Kernels RotaryHuggingFace kernels implementationGitHubHFBench
PyTorch SwiGLUPyTorch native SwiGLU implementationPyTorch RotaryPyTorch native implementation--Bench


+

LAYER NORMALIZATION

+
+ Layer Norm Latency +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
ImplementationDescriptionSourceHFBench
HF Kernels Layer NormHuggingFace kernels implementationGitHubHFBench
PyTorch Layer NormPyTorch native implementation--Bench
+

+ +

+ + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Binned PyTorch - OpenAI-style MoE

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.24s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Fri Oct 31 20:00:34 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   34C    P0             81W /  350W |       0MiB /  46068MiB |     18%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

OpenAI-style MoE Benchmark (Binned PyTorch)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 727.85s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def binned_gather(x, indices, bins, expert_capacity, top_k):  # copy each expert's routed rows of x into a zero-initialised [E, expert_capacity, H] buffer
+    E, H = bins.shape[0], x.shape[1]  # one bins entry per expert; H is the row width of x
+    out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)  # slots that are never written stay zero
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]  # bins is used as cumulative end offsets: expert e owns indices[bins[e-1]:bins[e]]
+        end = bins[e]
+        n = min(end - start, expert_capacity)  # assignments beyond expert_capacity are not gathered
+        for i in range(n):
+            flat_pos = indices[start + i]  # position in the flattened (token, slot) assignment list
+            tok = flat_pos // top_k  # row of x that this assignment belongs to
+            out[e, i] = x[tok]
+    return out
+
+
+def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):  # write per-expert rows of x back to their (token, slot) positions, optionally scaled, then sum the slots of each token
+    E, C, H = x.shape  # x is indexed as x[expert, row]; C is unpacked but not used below
+    N = indices.shape[0] // top_k  # token count implied by top_k entries per token in indices
+    out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)  # slots that are never written contribute zero to the final sum
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]  # bins is used as cumulative end offsets, as in binned_gather
+        end = bins[e]
+        n = end - start
+        if n == 0:
+            continue
+        take = min(n, expert_capacity)  # only the first expert_capacity rows of each expert are read back
+        for i in range(take):
+            flat_pos = indices[start + i]  # flattened (token, slot)
+            tok = flat_pos // top_k
+            slot = flat_pos % top_k
+            scale = weights[flat_pos] if weights is not None else 1.0  # weights is indexed by the same flattened position; None means unscaled
+            out[tok, slot] = x[e, i] * scale
+    return out.sum(dim=1)  # collapse the top_k slots -> [N, H]
+
+
+def sort_tokens_by_expert(router_indices, num_experts):  # sort the flattened expert assignments by expert id and derive per-expert counts and cumulative offsets
+    flat_indices = router_indices.flatten()
+    sorted_values, sorted_indices = torch.sort(flat_indices)  # sorted_indices[j] is the position in flat_indices of the j-th smallest expert id
+    tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)  # minlength guarantees at least num_experts count entries
+    bins = torch.cumsum(tokens_per_expert, dim=0)  # bins[e] = number of assignments routed to experts 0..e
+    return sorted_indices, sorted_values, bins, tokens_per_expert
+
+
+def binned_experts_ref(  # gather tokens per expert, apply the gated two-matmul expert MLP, then scatter back weighted by the selected routing weights
+    hidden_states,  # unpacked as [B, S, H] below
+    router_indices,  # expert ids per token; shape[1] is K and it is viewed as [B*S, K] below
+    routing_weights,  # dense per-expert weights; shape[2] is E and it is viewed as [B*S, E] below
+    gate_up_proj,  # per-expert matrices for the first bmm; output columns interleave gate (even) and up (odd)
+    gate_up_proj_bias,  # per-expert bias, broadcast over the capacity dimension
+    down_proj,  # per-expert matrices for the second bmm
+    down_proj_bias,  # per-expert bias, broadcast over the capacity dimension
+    expert_capacity,  # maximum number of rows kept per expert by the gather/scatter helpers
+):
+    B, S, H = hidden_states.shape
+    E, K = routing_weights.shape[2], router_indices.shape[1]
+
+    indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)  # sorted values and per-expert counts are not needed here
+    x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)  # [E, expert_capacity, H]
+
+    gate_up = torch.bmm(x, gate_up_proj) + gate_up_proj_bias[..., None, :]  # None inserts the capacity axis so the bias broadcasts per expert
+    gate, up = gate_up[..., ::2], gate_up[..., 1::2]  # de-interleave: even columns -> gate, odd columns -> up
+
+    # clamp to limit: gate is bounded from above only, up on both sides
+    limit = 7.0
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+
+    glu = gate * torch.sigmoid(gate * 1.702)  # NOTE(review): 1.702 is presumably the sigmoid-based GELU approximation constant; confirm
+    x = (up + 1) * glu
+    x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]
+
+    # build routing weights aligned to (token, slot)
+    flat_dense = routing_weights.view(-1, E)  # [B*S, E]
+    flat_router = router_indices.view(-1, K)  # [B*S, K]
+    selected = torch.gather(flat_dense, 1, flat_router).reshape(-1)  # [B*S*K]
+
+    # scatter back, scaling each expert row by its selected routing weight
+    y = binned_scatter(x, indices, selected, bins, expert_capacity, K)  # [B*S, H]
+
+    return y.view(B, S, H)
+
+
+def binned_torch_openai_moe(  # benchmark entry point: derives expert_capacity from the input shapes and delegates to binned_experts_ref
+    hidden_states,
+    router_indices,
+    routing_weights,
+    gate_up_proj,
+    gate_up_proj_bias,
+    down_proj,
+    down_proj_bias,
+):
+    """
+    Binned PyTorch implementation of OpenAI-style MoE.
+    Sorts tokens by expert assignment for more efficient batched processing.
+    """
+    B, S = hidden_states.shape[0], hidden_states.shape[1]
+    K = router_indices.shape[1]  # assignments per token
+
+    # Set expert_capacity to a reasonable value (max tokens per expert)
+    # Use 2x the average (B*S*K / E) to handle imbalance; assignments beyond this are skipped by binned_gather/binned_scatter
+    expert_capacity = (B * S * K * 2) // routing_weights.shape[2]  # NOTE(review): a routing skewed past 2x the average would silently drop assignments; confirm this is acceptable for the workloads
+
+    return binned_experts_ref(
+        hidden_states,
+        router_indices,
+        routing_weights,
+        gate_up_proj,
+        gate_up_proj_bias,
+        down_proj,
+        down_proj_bias,
+        expert_capacity,
+    )
+
+
+run_benchmark(  # hand binned_torch_openai_moe to the kernels_benchmark_tools harness as the OPENAI_MOE implementation
+    kernel_type=KernelTypeEnum.OPENAI_MOE,
+    impl_name="binned_torch",  # matches the "binned_torch" label shown in the profile traces
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=binned_torch_openai_moe,
+    dtype="float32",
+)
+
+ +
+
+
+
+
Running openai_moe benchmark on cuda with 8 workloads.
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S512_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     906.550ms      1808.50%     906.550ms     906.550ms             1  
+                                           binned_torch        25.29%     229.728ms       100.00%     908.308ms     908.308ms       0.000us         0.00%      50.129ms      50.129ms             1  
+                                             aten::item         1.81%      16.434ms        25.66%     233.033ms      15.186us       0.000us         0.00%      15.809ms       1.030us         15345  
+                              aten::_local_scalar_dense         6.08%      55.189ms        23.85%     216.599ms      14.115us      15.808ms        31.54%      15.809ms       1.030us         15345  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      15.808ms        31.54%      15.808ms       1.030us         15345  
+                                              aten::bmm         0.02%     187.925us         0.02%     226.636us      37.773us       7.688ms        15.34%       7.688ms       1.281ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.688ms        15.34%       7.688ms       1.281ms             6  
+                                     aten::floor_divide         5.37%      48.789ms        13.13%     119.247ms      19.409us       7.554ms        15.07%       7.554ms       1.230us          6144  
+                                            aten::copy_         3.71%      33.699ms         9.08%      82.451ms      13.394us       6.606ms        13.18%       6.607ms       1.073us          6156  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.602ms        13.17%       6.602ms       1.073us          6153  
+                                              aten::mul         3.08%      27.972ms         5.49%      49.893ms      16.194us       4.718ms         9.41%       4.718ms       1.531us          3081  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.471ms         8.92%       4.471ms       1.456us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032ms         8.04%       4.032ms       1.312us          3072  
+                                        aten::remainder         3.03%      27.567ms         4.66%      42.309ms      13.772us       3.722ms         7.42%       3.722ms       1.212us          3072  
+                                              aten::add         2.91%      26.436ms         4.87%      44.207ms      14.575us       3.546ms         7.07%       3.546ms       1.169us          3033  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.524ms         7.03%       3.524ms       1.147us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.156ms         6.30%       3.156ms       1.042us          3030  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.964ms         3.92%       1.964ms       1.279us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.758ms         3.51%       1.758ms       1.145us          1536  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     286.305us         0.57%     286.305us      47.718us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 908.315ms
+Self CUDA time total: 50.127ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S512_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     939.657ms      1760.51%     939.657ms     939.657ms             1  
+                                           binned_torch        24.72%     232.366ms       100.00%     940.175ms     940.175ms       0.000us         0.00%      53.379ms      53.379ms             1  
+                                             aten::item         1.65%      15.471ms        26.56%     249.752ms      14.748us       0.000us         0.00%      17.339ms       1.024us         16935  
+                              aten::_local_scalar_dense         6.16%      57.893ms        24.92%     234.282ms      13.834us      17.337ms        32.48%      17.339ms       1.024us         16935  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      17.337ms        32.48%      17.337ms       1.024us         16935  
+                                              aten::bmm         0.02%     191.684us         0.02%     230.777us      38.463us       7.882ms        14.77%       7.882ms       1.314ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.882ms        14.77%       7.882ms       1.314ms             6  
+                                     aten::floor_divide         5.10%      47.974ms        12.37%     116.337ms      18.935us       7.540ms        14.13%       7.541ms       1.227us          6144  
+                                            aten::copy_         3.80%      35.738ms         9.00%      84.586ms      13.740us       6.593ms        12.35%       6.595ms       1.071us          6156  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.590ms        12.35%       6.590ms       1.071us          6153  
+                                              aten::add         4.16%      39.066ms         7.01%      65.874ms      14.342us       5.113ms         9.58%       5.113ms       1.113us          4593  
+                                              aten::mul         2.92%      27.472ms         5.20%      48.883ms      15.866us       4.715ms         8.83%       4.715ms       1.530us          3081  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.472ms         8.38%       4.472ms       1.456us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.021ms         7.53%       4.021ms       1.309us          3072  
+                                        aten::remainder         2.73%      25.664ms         4.27%      40.147ms      13.069us       3.707ms         6.95%       3.707ms       1.207us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.519ms         6.59%       3.519ms       1.146us          3072  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.178ms         5.95%       3.178ms       1.049us          3030  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.958ms         3.67%       1.958ms       1.275us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.749ms         3.28%       1.749ms       1.139us          1536  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.537ms         2.88%       1.537ms       0.985us          1560  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 940.182ms
+Self CUDA time total: 53.374ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.751s      1703.41%        1.751s        1.751s             1  
+                                           binned_torch        24.63%     431.727ms       100.00%        1.753s        1.753s       0.000us         0.00%     102.829ms     102.829ms             1  
+                                             aten::item         1.69%      29.621ms        25.96%     455.095ms      14.915us       0.000us         0.00%      31.387ms       1.029us         30513  
+                              aten::_local_scalar_dense         5.96%     104.552ms        24.27%     425.474ms      13.944us      31.383ms        30.52%      31.387ms       1.029us         30513  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      31.383ms        30.52%      31.383ms       1.029us         30513  
+                                              aten::bmm         0.01%     224.614us         0.02%     267.595us      44.599us      15.143ms        14.73%      15.143ms       2.524ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.143ms        14.73%      15.143ms       2.524ms             6  
+                                     aten::floor_divide         5.56%      97.549ms        13.34%     233.779ms      19.025us      15.089ms        14.68%      15.090ms       1.228us         12288  
+                                            aten::copy_         4.01%      70.283ms         9.47%     166.011ms      13.497us      13.317ms        12.95%      13.317ms       1.083us         12300  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.313ms        12.95%      13.313ms       1.083us         12294  
+                                              aten::mul         3.14%      55.060ms         5.66%      99.236ms      16.128us      11.295ms        10.99%      11.297ms       1.836us          6153  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.940ms         9.67%       9.940ms       1.618us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.059ms         7.84%       8.059ms       1.312us          6144  
+                                              aten::add         2.85%      49.952ms         4.90%      85.866ms      14.522us       7.505ms         7.30%       7.506ms       1.269us          5913  
+                                        aten::remainder         3.02%      53.015ms         4.74%      83.117ms      13.528us       7.414ms         7.21%       7.416ms       1.207us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.031ms         6.84%       7.031ms       1.144us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.224ms         6.05%       6.224ms       1.053us          5910  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.914ms         3.81%       3.914ms       1.274us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.500ms         3.40%       3.500ms       1.139us          3072  
+                                            aten::clamp         0.00%      71.603us         0.01%     117.833us      19.639us       1.180ms         1.15%       1.180ms     196.722us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.753s
+Self CUDA time total: 102.819ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.834s      1680.90%        1.834s        1.834s             1  
+                                           binned_torch        24.76%     454.393ms       100.00%        1.835s        1.835s       0.000us         0.00%     109.119ms     109.119ms             1  
+                                             aten::item         1.65%      30.229ms        26.42%     484.819ms      14.374us       0.000us         0.00%      34.734ms       1.030us         33729  
+                              aten::_local_scalar_dense         6.08%     111.551ms        24.77%     454.590ms      13.478us      34.731ms        31.83%      34.734ms       1.030us         33729  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      34.731ms        31.83%      34.731ms       1.030us         33729  
+                                              aten::bmm         0.01%     219.836us         0.01%     260.868us      43.478us      15.243ms        13.97%      15.243ms       2.540ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.243ms        13.97%      15.243ms       2.540ms             6  
+                                     aten::floor_divide         5.37%      98.619ms        12.62%     231.581ms      18.846us      15.065ms        13.81%      15.065ms       1.226us         12288  
+                                            aten::copy_         3.65%      66.986ms         8.64%     158.623ms      12.896us      13.313ms        12.20%      13.316ms       1.083us         12300  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.309ms        12.20%      13.309ms       1.082us         12297  
+                                              aten::mul         2.96%      54.365ms         5.27%      96.616ms      15.702us      10.967ms        10.05%      10.969ms       1.783us          6153  
+                                              aten::add         4.05%      74.247ms         6.97%     127.934ms      14.060us      10.631ms         9.74%      10.631ms       1.168us          9099  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.613ms         8.81%       9.613ms       1.565us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.047ms         7.37%       8.047ms       1.310us          6144  
+                                        aten::remainder         2.81%      51.641ms         4.37%      80.193ms      13.052us       7.438ms         6.82%       7.438ms       1.211us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.018ms         6.43%       7.018ms       1.142us          6144  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.225ms         5.71%       6.225ms       1.053us          5910  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.928ms         3.60%       3.928ms       1.279us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.510ms         3.22%       3.510ms       1.143us          3072  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.154ms         2.89%       3.154ms       0.990us          3186  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.835s
+Self CUDA time total: 109.111ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S512_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.518s      1672.53%        3.518s        3.518s             1  
+                                           binned_torch        24.37%     858.118ms       100.00%        3.521s        3.521s       0.000us         0.00%     210.357ms     210.357ms             1  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      63.177ms        30.04%      63.177ms       1.026us         61586  
+                                             aten::item         1.69%      59.432ms        26.02%     916.275ms      14.878us       0.000us         0.00%      63.177ms       1.026us         61587  
+                              aten::_local_scalar_dense         5.96%     209.806ms        24.34%     856.843ms      13.913us      63.176ms        30.03%      63.177ms       1.026us         61587  
+                                     aten::floor_divide         5.42%     190.698ms        13.50%     475.217ms      19.337us      30.482ms        14.49%      30.486ms       1.240us         24576  
+                                              aten::bmm         0.01%     235.397us         0.01%     281.998us      47.000us      29.291ms        13.93%      29.291ms       4.882ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.291ms        13.93%      29.291ms       4.882ms             6  
+                                            aten::copy_         3.77%     132.744ms         9.15%     322.282ms      13.107us      26.808ms        12.75%      26.810ms       1.090us         24588  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.805ms        12.74%      26.805ms       1.090us         24582  
+                                              aten::mul         3.15%     110.895ms         5.78%     203.457ms      16.545us      25.566ms        12.15%      25.568ms       2.079us         12297  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.101ms        10.51%      22.101ms       1.799us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.470ms         7.83%      16.470ms       1.340us         12288  
+                                              aten::add         2.99%     105.439ms         5.15%     181.211ms      14.601us      16.115ms         7.66%      16.116ms       1.298us         12411  
+                                        aten::remainder         2.99%     105.111ms         4.72%     166.195ms      13.525us      14.836ms         7.05%      14.838ms       1.208us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.014ms         6.66%      14.014ms       1.140us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.996ms         6.18%      12.996ms       1.047us         12408  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       7.830ms         3.72%       7.830ms       1.274us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.006ms         3.33%       7.006ms       1.140us          6144  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.626ms         1.25%       2.626ms     437.595us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.521s
+Self CUDA time total: 210.342ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S512_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.742s      1679.57%        3.742s        3.742s             1  
+                                           binned_torch        24.42%     914.204ms       100.00%        3.744s        3.744s       0.000us         0.00%     222.834ms     222.834ms             1  
+                                             aten::item         1.73%      64.729ms        26.53%     993.125ms      14.638us       0.000us         0.00%      69.848ms       1.030us         67845  
+                              aten::_local_scalar_dense         6.14%     229.850ms        24.80%     928.396ms      13.684us      69.844ms        31.35%      69.848ms       1.030us         67845  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      69.844ms        31.35%      69.844ms       1.030us         67841  
+                                     aten::floor_divide         5.29%     197.931ms        12.52%     468.921ms      19.080us      30.509ms        13.69%      30.515ms       1.242us         24576  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.140ms        13.08%      29.140ms       4.857ms             6  
+                                              aten::bmm         0.01%     232.675us         0.01%     273.538us      45.590us      29.140ms        13.08%      29.140ms       4.857ms             6  
+                                            aten::copy_         3.66%     136.881ms         8.73%     326.908ms      13.295us      26.646ms        11.96%      26.647ms       1.084us         24588  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.643ms        11.96%      26.643ms       1.084us         24581  
+                                              aten::mul         2.96%     110.832ms         5.24%     196.253ms      15.959us      25.520ms        11.45%      25.522ms       2.075us         12297  
+                                              aten::add         4.16%     155.619ms         7.13%     266.948ms      14.322us      22.169ms         9.95%      22.169ms       1.189us         18639  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.076ms         9.91%      22.076ms       1.797us         12288  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.462ms         7.39%      16.462ms       1.340us         12287  
+                                        aten::remainder         2.77%     103.887ms         4.33%     162.240ms      13.203us      14.877ms         6.68%      14.879ms       1.211us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.047ms         6.30%      14.047ms       1.143us         12287  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.957ms         5.82%      12.957ms       1.044us         12407  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       7.856ms         3.53%       7.856ms       1.279us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.021ms         3.15%       7.021ms       1.143us          6144  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.109ms         2.74%       6.109ms       0.981us          6228  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.744s
+Self CUDA time total: 222.814ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        6.967s      1665.27%        6.967s        6.967s             1  
+                                           binned_torch        24.68%        1.721s       100.00%        6.973s        6.973s       0.000us         0.00%     418.392ms     418.392ms             1  
+                                             aten::item         1.64%     114.231ms        25.94%        1.809s      14.732us       0.000us         0.00%     125.163ms       1.020us        122763  
+                              aten::_local_scalar_dense         5.97%     416.624ms        24.30%        1.694s      13.802us     125.151ms        29.91%     125.163ms       1.020us        122763  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     125.151ms        29.91%     125.151ms       1.019us        122762  
+                                     aten::floor_divide         5.62%     391.846ms        13.33%     929.253ms      18.906us      61.051ms        14.59%      61.053ms       1.242us         49152  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.281ms        13.69%      57.281ms       9.547ms             6  
+                                              aten::bmm         0.00%     234.996us         0.00%     276.787us      46.131us      57.281ms        13.69%      57.281ms       9.547ms             6  
+                                            aten::copy_         3.92%     273.517ms         9.35%     652.240ms      13.268us      53.435ms        12.77%      53.437ms       1.087us         49158  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.433ms        12.77%      53.433ms       1.087us         49154  
+                                              aten::mul         3.15%     219.950ms         5.62%     391.612ms      15.929us      51.411ms        12.29%      51.419ms       2.091us         24585  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.451ms        10.62%      44.451ms       1.809us         24576  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      32.993ms         7.89%      32.993ms       1.343us         24576  
+                                              aten::add         2.87%     200.428ms         4.94%     344.166ms      14.085us      31.887ms         7.62%      31.889ms       1.305us         24435  
+                                        aten::remainder         3.00%     208.953ms         4.67%     325.902ms      13.261us      29.680ms         7.09%      29.684ms       1.208us         24576  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      28.059ms         6.71%      28.059ms       1.142us         24576  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.247ms         6.03%      25.247ms       1.033us         24431  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.667ms         3.74%      15.667ms       1.275us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.014ms         3.35%      14.014ms       1.140us         12288  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       5.233ms         1.25%       5.233ms     872.184us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 6.973s
+Self CUDA time total: 418.361ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        7.368s      1660.72%        7.368s        7.368s             1  
+                                           binned_torch        24.39%        1.797s       100.00%        7.370s        7.370s       0.000us         0.00%     443.698ms     443.698ms             1  
+                                             aten::item         1.69%     124.742ms        26.51%        1.954s      14.504us       0.000us         0.00%     137.717ms       1.022us        134715  
+                              aten::_local_scalar_dense         6.11%     450.407ms        24.82%        1.829s      13.577us     137.708ms        31.04%     137.717ms       1.022us        134715  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     137.710ms        31.04%     137.710ms       1.022us        134711  
+                                     aten::floor_divide         5.42%     399.563ms        12.65%     932.414ms      18.970us      61.071ms        13.77%      61.077ms       1.243us         49152  
+                                              aten::bmm         0.00%     230.664us         0.00%     272.466us      45.411us      57.304ms        12.92%      57.304ms       9.551ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.304ms        12.92%      57.304ms       9.551ms             6  
+                                            aten::copy_         3.65%     269.132ms         8.67%     639.259ms      13.004us      54.065ms        12.19%      54.067ms       1.100us         49158  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.062ms        12.19%      54.062ms       1.100us         49153  
+                                              aten::mul         2.96%     217.959ms         5.26%     387.551ms      15.764us      51.653ms        11.64%      51.660ms       2.101us         24585  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.653ms        10.06%      44.653ms       1.817us         24576  
+                                              aten::add         4.03%     296.962ms         6.96%     512.647ms      14.100us      43.690ms         9.85%      43.694ms       1.202us         36357  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      32.954ms         7.43%      32.954ms       1.341us         24575  
+                                        aten::remainder         2.83%     208.527ms         4.40%     323.906ms      13.180us      29.662ms         6.69%      29.664ms       1.207us         24576  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      28.119ms         6.34%      28.119ms       1.144us         24576  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.409ms         5.73%      25.409ms       1.040us         24431  
+void at::native::vectorized_elementwise_kernel<2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.666ms         3.53%      15.666ms       1.275us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      13.995ms         3.15%      13.995ms       1.139us         12288  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.644ms         2.62%      11.644ms       0.977us         11922  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 7.370s
+Self CUDA time total: 443.660ms
+
+
+impl                     wl                  p50(ms)  ok
+binned_torch             cuda_B1_S1024_E2     372.79  True
+binned_torch             cuda_B1_S1024_E4     382.68  True
+binned_torch             cuda_B1_S512_E2      150.05  True
+binned_torch             cuda_B1_S512_E4      200.26  True
+binned_torch             cuda_B4_S1024_E2    1486.48  True
+binned_torch             cuda_B4_S1024_E4    1524.50  True
+binned_torch             cuda_B4_S512_E2      742.02  True
+binned_torch             cuda_B4_S512_E4      801.90  True
+
+
+

Artifacts:

+openai_moe.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/openai_moe/impls/cells/benchmark.py b/openai_moe/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..1527168c6489fe70597cc9c4a6625c220d6a5e20 --- /dev/null +++ b/openai_moe/impls/cells/benchmark.py @@ -0,0 +1,136 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +import torch +import sys +from kernels_benchmark_tools import KernelTypeEnum, run_benchmark + + +def binned_gather(x, indices, bins, expert_capacity, top_k): + E, H = bins.shape[0], x.shape[1] + out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype) + for e in range(E): + start = 0 if e == 0 else bins[e - 1] + end = bins[e] + n = min(end - start, expert_capacity) + for i in range(n): + flat_pos = indices[start + i] + tok = flat_pos // top_k + out[e, i] = x[tok] + return out + + +def binned_scatter(x, indices, weights, bins, expert_capacity, top_k): + E, C, H = x.shape + N = indices.shape[0] // top_k + out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device) + for e in range(E): + start = 0 if e == 0 else bins[e - 1] + end = bins[e] + n = end - start + if n == 0: + continue + take = min(n, expert_capacity) + for i in range(take): + flat_pos = indices[start + i] # flattened (token, slot) + tok = flat_pos // top_k + slot = flat_pos % top_k + scale = weights[flat_pos] if weights is not None else 1.0 + out[tok, slot] = x[e, i] * scale + return out.sum(dim=1) + + +def sort_tokens_by_expert(router_indices, num_experts): + flat_indices = router_indices.flatten() + sorted_values, sorted_indices = torch.sort(flat_indices) + tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts) + bins = torch.cumsum(tokens_per_expert, dim=0) + return sorted_indices, sorted_values, bins, tokens_per_expert + + 
+def binned_experts_ref( + hidden_states, + router_indices, + routing_weights, + gate_up_proj, + gate_up_proj_bias, + down_proj, + down_proj_bias, + expert_capacity, +): + B, S, H = hidden_states.shape + E, K = routing_weights.shape[2], router_indices.shape[1] + + indices, _, bins, _ = sort_tokens_by_expert(router_indices, E) + x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K) + + gate_up = torch.bmm(x, gate_up_proj) + gate_up_proj_bias[..., None, :] + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + + # clamp to limit + limit = 7.0 + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + + glu = gate * torch.sigmoid(gate * 1.702) + x = (up + 1) * glu + x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :] + + # build routing weights aligned to (token, slot) + flat_dense = routing_weights.view(-1, E) # [B*S, E] + flat_router = router_indices.view(-1, K) # [B*S, K] + selected = torch.gather(flat_dense, 1, flat_router).reshape(-1) # [B*S*K] + + # scatter back + y = binned_scatter(x, indices, selected, bins, expert_capacity, K) # [B*S, H] + + return y.view(B, S, H) + + +def binned_torch_openai_moe( + hidden_states, + router_indices, + routing_weights, + gate_up_proj, + gate_up_proj_bias, + down_proj, + down_proj_bias, +): + """ + Binned PyTorch implementation of OpenAI-style MoE. + Sorts tokens by expert assignment for more efficient batched processing. 
+ """ + B, S = hidden_states.shape[0], hidden_states.shape[1] + K = router_indices.shape[1] + + # Set expert_capacity to a reasonable value (max tokens per expert) + # Use 2x the average to handle imbalance + expert_capacity = (B * S * K * 2) // routing_weights.shape[2] + + return binned_experts_ref( + hidden_states, + router_indices, + routing_weights, + gate_up_proj, + gate_up_proj_bias, + down_proj, + down_proj_bias, + expert_capacity, + ) + + +run_benchmark( + kernel_type=KernelTypeEnum.OPENAI_MOE, + impl_name="binned_torch", + impl_tags={"family": "pytorch", "backend": "eager"}, + impl_func=binned_torch_openai_moe, + dtype="float32", +) \ No newline at end of file diff --git a/openai_moe/impls/cells/nv.py b/openai_moe/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5 --- /dev/null +++ b/openai_moe/impls/cells/nv.py @@ -0,0 +1,2 @@ +import subprocess +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/openai_moe/impls/gpt_oss_moe.html b/openai_moe/impls/gpt_oss_moe.html new file mode 100644 index 0000000000000000000000000000000000000000..2133ccac99c4f05bc3163b7f04c006955d4539b2 --- /dev/null +++ b/openai_moe/impls/gpt_oss_moe.html @@ -0,0 +1,4545 @@ + + + + + + gpt_oss_moe + + + + + + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

GptOssExperts - OpenAI-style MoE

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.24s + | + +Raw +GitHub +🤗 HF +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Fri Oct 31 20:00:34 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   34C    P0             81W /  350W |       0MiB /  46068MiB |     18%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

OpenAI-style MoE Benchmark (GptOssExperts Reference)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 24.32s + | + +Raw +GitHub +🤗 HF +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# kernels = { git = "https://github.com/huggingface/kernels.git" }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load yamoe to get GptOssExperts reference
+yamoe = get_kernel("drbh/yamoe", revision="v0.2.0")
+GptOssExperts = yamoe.vendored.gpt_oss_mlp.GptOssExperts
+
+
+def gpt_oss_openai_moe(
+    hidden_states,
+    router_indices,
+    routing_weights,
+    gate_up_proj,
+    gate_up_proj_bias,
+    down_proj,
+    down_proj_bias,
+):
+    """
+    GptOssExperts reference implementation of OpenAI-style MoE.
+    This is the reference model implementation from the original GPT OSS codebase.
+    """
+    B, S, H = hidden_states.shape
+    E = routing_weights.shape[2]
+
+    # Create a config object for GptOssExperts
+    config = type("Config", (), {})()
+    config.hidden_size = H
+    config.intermediate_size = gate_up_proj.shape[2] // 2  # expert_dim / 2 = H
+    config.num_local_experts = E
+
+    # Initialize model
+    model = GptOssExperts(config)
+
+    # Set weights from benchmark inputs
+    model.gate_up_proj.data = gate_up_proj
+    model.gate_up_proj_bias.data = gate_up_proj_bias
+    model.down_proj.data = down_proj
+    model.down_proj_bias.data = down_proj_bias
+
+    model = model.to(hidden_states.device)
+    model.eval()
+
+    # Force GptOssExperts to use CPU path for correctness (matches naive_moe_ref behavior)
+    # The GPU path processes all experts which can lead to numerical differences
+    # CPU path explicitly uses router_indices like the reference implementation
+    model.train()  # Force CPU path
+
+    # Flatten routing_weights to [batch_seq, num_experts]
+    routing_weights_flat = routing_weights.view(-1, E)
+
+    # Run forward pass
+    with torch.no_grad():
+        output = model(hidden_states, router_indices, routing_weights_flat)
+
+    model.eval()  # Reset to eval mode
+
+    return output
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.OPENAI_MOE,
+    impl_name="gpt_oss_experts",
+    impl_tags={"family": "reference", "backend": "pytorch"},
+    impl_func=gpt_oss_openai_moe,
+    dtype="float32",
+)
+
+ +
+
+
+
+
Running openai_moe benchmark on cuda with 8 workloads.
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      10.211ms       197.81%      10.211ms      10.211ms             1  
+                                        gpt_oss_experts        16.48%       2.023ms        99.94%      12.270ms      12.270ms       0.000us         0.00%       5.165ms       5.165ms             1  
+                                           aten::matmul         0.22%      26.489us         3.82%     468.520us      39.043us       0.000us         0.00%       4.540ms     378.357us            12  
+                                               aten::mm         2.36%     289.825us         3.60%     442.031us      36.836us       4.540ms        87.96%       4.540ms     378.357us            12  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.078ms        59.62%       3.078ms     341.948us             9  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.457ms        28.23%       1.457ms     485.813us             3  
+                                              aten::mul         1.42%     174.948us         2.34%     287.701us      11.988us     109.119us         2.11%     109.119us       4.547us            24  
+                                              aten::add         1.61%     197.786us         3.85%     472.357us      26.242us     103.039us         2.00%     103.039us       5.724us            18  
+                                            aten::index         1.73%     212.127us         2.86%     350.900us      29.242us      86.591us         1.68%      86.591us       7.216us            12  
+                                       aten::index_add_         0.51%      62.499us         0.79%      97.312us      16.219us      82.688us         1.60%      82.688us      13.781us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      82.688us         1.60%      82.688us      13.781us             6  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      80.511us         1.56%      80.511us       6.709us            12  
+                                          aten::nonzero         2.20%     270.146us         6.58%     808.380us      89.820us      63.743us         1.23%      74.368us       8.263us             9  
+                                            aten::clamp         0.98%     120.045us         1.63%     200.026us      16.669us      64.705us         1.25%      64.705us       5.392us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      64.705us         1.25%      64.705us       5.392us            12  
+                                            aten::where         0.06%       7.400us         5.25%     644.007us     107.334us       0.000us         0.00%      60.384us      10.064us             6  
+                                    aten::nonzero_numpy         0.11%      13.320us         5.19%     636.607us     106.101us       0.000us         0.00%      60.384us      10.064us             6  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      60.063us         1.16%      60.063us      10.011us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      56.800us         1.10%      56.800us       4.733us            12  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      50.911us         0.99%      50.911us       1.131us            45  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 12.278ms
+Self CUDA time total: 5.162ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      13.933ms       229.38%      13.933ms      13.933ms             1  
+                                        gpt_oss_experts        16.29%       2.560ms        99.97%      15.712ms      15.712ms       0.000us         0.00%       6.077ms       6.077ms             1  
+                                           aten::matmul         0.30%      47.223us         5.17%     812.581us      33.858us       0.000us         0.00%       5.268ms     219.512us            24  
+                                               aten::mm         3.09%     485.951us         4.87%     765.358us      31.890us       5.268ms        86.73%       5.268ms     219.512us            24  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.213ms        85.81%       5.213ms     217.198us            24  
+                                          aten::nonzero         2.45%     385.408us         7.89%       1.240ms      82.649us     112.163us         1.85%     134.498us       8.967us            15  
+                                              aten::mul         2.03%     318.275us         3.36%     528.222us      11.005us     130.496us         2.15%     130.496us       2.719us            48  
+                                              aten::add         2.25%     353.820us         3.74%     587.771us      16.327us     127.072us         2.09%     127.072us       3.530us            36  
+                                            aten::where         0.08%      11.882us         7.49%       1.177ms      98.080us       0.000us         0.00%     120.705us      10.059us            12  
+                                    aten::nonzero_numpy         0.15%      24.083us         7.41%       1.165ms      97.090us       0.000us         0.00%     120.705us      10.059us            12  
+                                            aten::index         2.31%     363.442us         3.93%     617.030us      25.710us     110.145us         1.81%     110.145us       4.589us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     101.312us         1.67%     101.312us       4.221us            24  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      91.447us         1.51%      91.447us       1.051us            87  
+                                            aten::clamp         1.32%     207.076us         2.26%     355.011us      14.792us      85.793us         1.41%      85.793us       3.575us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      85.793us         1.41%      85.793us       3.575us            24  
+                                             aten::item         0.52%      81.620us        38.60%       6.066ms      84.255us       0.000us         0.00%      75.446us       1.048us            72  
+                              aten::_local_scalar_dense         2.00%     315.046us        38.08%       5.985ms      83.122us      75.446us         1.24%      75.446us       1.048us            72  
+                                       aten::index_add_         0.75%     118.511us         1.16%     182.084us      15.174us      72.926us         1.20%      72.926us       6.077us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      72.926us         1.20%      72.926us       6.077us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us      65.857us         1.08%      65.857us       5.488us            12  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 15.717ms
+Self CUDA time total: 6.074ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      12.540ms       148.48%      12.540ms      12.540ms             1  
+                                        gpt_oss_experts        11.83%       1.734ms        99.96%      14.654ms      14.654ms       0.000us         0.00%       8.451ms       8.451ms             1  
+                                           aten::matmul         0.16%      23.602us         3.00%     439.592us      36.633us       0.000us         0.00%       7.417ms     618.087us            12  
+                                               aten::mm         1.78%     261.037us         2.84%     415.990us      34.666us       7.417ms        87.82%       7.417ms     618.087us            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       4.532ms        53.65%       4.532ms     755.263us             6  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       1.475ms        17.46%       1.475ms     491.509us             3  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.405ms        16.64%       1.405ms     468.490us             3  
+                                              aten::mul         1.05%     153.262us         1.78%     261.173us      10.882us     197.791us         2.34%     197.791us       8.241us            24  
+                                              aten::add         1.26%     184.574us         2.07%     304.007us      16.889us     188.543us         2.23%     188.543us      10.475us            18  
+                                       aten::index_add_         0.35%      50.951us         0.57%      83.553us      13.925us     169.408us         2.01%     169.408us      28.235us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     169.408us         2.01%     169.408us      28.235us             6  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     149.663us         1.77%     149.663us      12.472us            12  
+                                            aten::index         1.27%     186.102us         2.16%     316.927us      26.411us     146.942us         1.74%     146.942us      12.245us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     117.440us         1.39%     117.440us      19.573us             6  
+                                            aten::clamp         0.71%     104.743us         1.22%     178.924us      14.910us     110.912us         1.31%     110.912us       9.243us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     110.912us         1.31%     110.912us       9.243us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.864us         1.24%     104.864us       8.739us            12  
+                                          aten::nonzero         1.58%     232.211us         4.94%     724.348us      80.483us      69.633us         0.82%      81.377us       9.042us             9  
+                                            aten::where         0.04%       6.259us         4.08%     597.684us      99.614us       0.000us         0.00%      66.816us      11.136us             6  
+                                    aten::nonzero_numpy         0.08%      11.999us         4.03%     591.425us      98.571us       0.000us         0.00%      66.816us      11.136us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 14.659ms
+Self CUDA time total: 8.446ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      18.317ms       174.31%      18.317ms      18.317ms             1  
+                                        gpt_oss_experts        13.54%       2.761ms        99.97%      20.385ms      20.385ms       0.000us         0.00%      10.514ms      10.514ms             1  
+                                           aten::matmul         0.23%      47.082us         4.02%     819.853us      34.161us       0.000us         0.00%       9.237ms     384.865us            24  
+                                               aten::mm         2.37%     482.255us         3.79%     772.771us      32.199us       9.237ms        87.90%       9.237ms     384.865us            24  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       6.282ms        59.78%       6.282ms     349.001us            18  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.944ms        28.01%       2.944ms     490.655us             6  
+                                              aten::mul         1.50%     305.331us         2.55%     520.818us      10.850us     235.298us         2.24%     235.298us       4.902us            48  
+                                              aten::add         1.72%     351.113us         2.86%     584.036us      16.223us     213.502us         2.03%     213.502us       5.931us            36  
+                                            aten::index         1.95%     397.314us         3.28%     668.454us      27.852us     205.349us         1.95%     205.349us       8.556us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     166.720us         1.59%     166.720us       6.947us            24  
+                                       aten::index_add_         0.50%     101.340us         0.81%     165.573us      13.798us     155.585us         1.48%     155.585us      12.965us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     155.585us         1.48%     155.585us      12.965us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     146.947us         1.40%     146.947us      12.246us            12  
+                                          aten::nonzero         1.95%     398.176us         6.26%       1.276ms      85.090us     121.380us         1.16%     145.668us       9.711us            15  
+                                            aten::clamp         1.04%     212.193us         1.79%     365.180us      15.216us     134.239us         1.28%     134.239us       5.593us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.239us         1.28%     134.239us       5.593us            24  
+                                            aten::where         0.06%      11.340us         5.97%       1.216ms     101.373us       0.000us         0.00%     131.522us      10.960us            12  
+                                    aten::nonzero_numpy         0.12%      24.140us         5.91%       1.205ms     100.428us       0.000us         0.00%     131.522us      10.960us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     119.840us         1.14%     119.840us       4.993us            24  
+                         Memcpy DtoH (Device -> Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     100.830us         0.96%     100.830us       1.159us            87  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 20.390ms
+Self CUDA time total: 10.509ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      21.031ms       119.92%      21.031ms      21.031ms             1  
+                                        gpt_oss_experts         7.59%       1.747ms        99.98%      23.024ms      23.024ms       0.000us         0.00%      17.548ms      17.548ms             1  
+                                           aten::matmul         0.10%      23.660us         1.94%     446.020us      37.168us       0.000us         0.00%      14.659ms       1.222ms            12  
+                                               aten::mm         1.17%     268.524us         1.83%     422.360us      35.197us      14.659ms        83.59%      14.659ms       1.222ms            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       8.967ms        51.13%       8.967ms       1.495ms             6  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.685ms        32.42%       5.685ms     947.562us             6  
+                                              aten::add         0.82%     187.722us         1.36%     312.616us      17.368us     785.408us         4.48%     785.408us      43.634us            18  
+                                              aten::mul         0.68%     156.369us         1.15%     264.222us      11.009us     674.688us         3.85%     674.688us      28.112us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     504.575us         2.88%     504.575us      42.048us            12  
+                                       aten::index_add_         0.22%      50.951us         0.37%      86.132us      14.355us     448.545us         2.56%     448.545us      74.757us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     448.545us         2.56%     448.545us      74.757us             6  
+                                            aten::clamp         0.46%     107.053us         0.80%     183.295us      15.275us     336.000us         1.92%     336.000us      28.000us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     336.000us         1.92%     336.000us      28.000us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     314.239us         1.79%     314.239us      52.373us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     280.833us         1.60%     280.833us      46.806us             6  
+                                            aten::index         0.81%     185.806us         1.39%     320.548us      26.712us     259.102us         1.48%     259.102us      21.592us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     258.944us         1.48%     258.944us      21.579us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     225.407us         1.29%     225.407us      37.568us             6  
+                                          aten::sigmoid         0.16%      36.131us         0.27%      61.901us      10.317us     175.073us         1.00%     175.073us      29.179us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     175.073us         1.00%     175.073us      29.179us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 23.030ms
+Self CUDA time total: 17.537ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      24.377ms       140.11%      24.377ms      24.377ms             1  
+                                        gpt_oss_experts        10.50%       2.651ms        99.98%      25.237ms      25.237ms       0.000us         0.00%      17.408ms      17.408ms             1  
+                                           aten::matmul         0.19%      47.519us         3.41%     860.801us      35.867us       0.000us         0.00%      15.185ms     632.705us            24  
+                                               aten::mm         2.06%     521.061us         3.22%     813.282us      33.887us      15.185ms        87.28%      15.185ms     632.705us            24  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       9.179ms        52.76%       9.179ms     764.922us            12  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.124ms        17.96%       3.124ms     520.682us             6  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.871ms        16.50%       2.871ms     478.432us             6  
+                                              aten::add         1.42%     359.495us         2.37%     598.003us      16.611us     427.713us         2.46%     427.713us      11.881us            36  
+                                              aten::mul         1.23%     309.946us         2.09%     527.073us      10.981us     420.510us         2.42%     420.510us       8.761us            48  
+                                       aten::index_add_         0.40%     101.283us         0.66%     166.886us      13.907us     383.489us         2.20%     383.489us      31.957us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     383.489us         2.20%     383.489us      31.957us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     343.712us         1.98%     343.712us      14.321us            24  
+                                            aten::index         1.56%     393.991us         2.62%     662.158us      27.590us     337.086us         1.94%     337.086us      14.045us            24  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     272.926us         1.57%     272.926us      22.744us            12  
+                                            aten::clamp         0.84%     212.993us         1.44%     363.038us      15.127us     230.431us         1.32%     230.431us       9.601us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     230.431us         1.32%     230.431us       9.601us            24  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     223.071us         1.28%     223.071us       9.295us            24  
+                                          aten::nonzero         1.57%     395.401us         5.00%       1.262ms      84.127us     128.836us         0.74%     156.164us      10.411us            15  
+                                            aten::where         0.05%      12.011us         4.77%       1.205ms     100.378us       0.000us         0.00%     140.900us      11.742us            12  
+                                    aten::nonzero_numpy         0.10%      25.021us         4.72%       1.193ms      99.377us       0.000us         0.00%     140.900us      11.742us            12  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 25.242ms
+Self CUDA time total: 17.398ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      40.556ms       109.47%      40.556ms      40.556ms             1  
+                                        gpt_oss_experts         4.33%       1.794ms        99.85%      41.353ms      41.353ms       0.000us         0.00%      37.080ms      37.080ms             1  
+                                           aten::matmul         0.06%      24.371us         1.08%     445.903us      37.159us       0.000us         0.00%      27.082ms       2.257ms            12  
+                                               aten::mm         0.70%     291.738us         1.02%     421.532us      35.128us      27.082ms        73.10%      27.082ms       2.257ms            12  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      27.079ms        73.09%      27.079ms       2.257ms            12  
+                                              aten::mul         0.38%     159.199us         0.65%     268.178us      11.174us       2.983ms         8.05%       2.983ms     124.287us            24  
+                                              aten::add         0.48%     198.424us         1.09%     451.763us      25.098us       2.404ms         6.49%       2.404ms     133.559us            18  
+                                            aten::clamp         0.27%     112.290us         0.46%     189.433us      15.786us       2.392ms         6.46%       2.392ms     199.373us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.392ms         6.46%       2.392ms     199.373us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.988ms         5.37%       1.988ms     165.669us            12  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.629ms         4.40%       1.629ms     135.763us            12  
+                                       aten::index_add_         0.12%      50.103us         0.20%      84.453us      14.076us     899.456us         2.43%     899.456us     149.909us             6  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     899.456us         2.43%     899.456us     149.909us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     774.912us         2.09%     774.912us     129.152us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     733.217us         1.98%     733.217us     122.203us             6  
+                                            aten::index         0.45%     187.302us         0.77%     318.787us      26.566us     712.767us         1.92%     712.767us      59.397us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     678.496us         1.83%     678.496us     113.083us             6  
+                                          aten::sigmoid         0.09%      36.082us         0.15%      63.023us      10.504us     323.008us         0.87%     323.008us      53.835us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     323.008us         0.87%     323.008us      53.835us             6  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     261.631us         0.71%     261.631us      43.605us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 41.415ms
+Self CUDA time total: 37.046ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      41.050ms       117.27%      41.050ms      41.050ms             1  
+                                        gpt_oss_experts         6.46%       2.709ms        99.99%      41.912ms      41.912ms       0.000us         0.00%      35.025ms      35.025ms             1  
+                                           aten::matmul         0.11%      47.590us         2.12%     888.873us      37.036us       0.000us         0.00%      29.051ms       1.210ms            24  
+                                               aten::mm         1.28%     536.727us         2.01%     841.283us      35.053us      29.051ms        82.99%      29.051ms       1.210ms            24  
+void cutlass::Kernel2<cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      20.585ms        58.81%      20.585ms       1.372ms            15  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       8.453ms        24.15%       8.453ms     939.204us             9  
+                                              aten::add         0.88%     367.610us         1.45%     609.056us      16.918us       1.486ms         4.24%       1.486ms      41.264us            36  
+                                              aten::mul         0.74%     309.128us         1.24%     518.283us      10.798us       1.380ms         3.94%       1.380ms      28.757us            48  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     925.695us         2.64%     925.695us      38.571us            24  
+                                       aten::index_add_         0.24%      99.111us         0.40%     167.273us      13.939us     903.487us         2.58%     903.487us      75.291us            12  
+void at::native::indexFuncLargeIndex<float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     903.487us         2.58%     903.487us      75.291us            12  
+                                            aten::clamp         0.51%     214.986us         0.87%     364.790us      15.200us     775.806us         2.22%     775.806us      32.325us            24  
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     775.806us         2.22%     775.806us      32.325us            24  
+                                            aten::index         0.89%     373.269us         1.50%     629.207us      26.217us     670.881us         1.92%     670.881us      27.953us            24  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     631.200us         1.80%     631.200us      52.600us            12  
+void at::native::vectorized_gather_kernel<16, long>(...         0.00%       0.000us         0.00%       0.000us       0.000us     600.224us         1.71%     600.224us      50.019us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     559.808us         1.60%     559.808us      46.651us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     540.611us         1.54%     540.611us      22.525us            24  
+                                          aten::sigmoid         0.17%      72.182us         0.29%     123.582us      10.298us     351.039us         1.00%     351.039us      29.253us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     351.039us         1.00%     351.039us      29.253us            12  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 41.917ms
+Self CUDA time total: 35.005ms
+
+
+impl                     wl                  p50(ms)  ok
+gpt_oss_experts          cuda_B1_S1024_E2       3.79  True
+gpt_oss_experts          cuda_B1_S1024_E4       5.24  True
+gpt_oss_experts          cuda_B1_S512_E2        2.63  True
+gpt_oss_experts          cuda_B1_S512_E4        3.89  True
+gpt_oss_experts          cuda_B4_S1024_E2      13.28  True
+gpt_oss_experts          cuda_B4_S1024_E4      13.19  True
+gpt_oss_experts          cuda_B4_S512_E2        6.74  True
+gpt_oss_experts          cuda_B4_S512_E4        7.36  True
+
+
+
▶ UV Install Logs
+ +
+
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s] +Fetching 6 files: 33%|███▎ | 2/6 [00:00<00:00, 16.13it/s] +Fetching 6 files: 67%|██████▋ | 4/6 [00:00<00:00, 7.33it/s] +Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 11.97it/s]
+
+

Artifacts:

+openai_moe.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/openai_moe/impls/index.html b/openai_moe/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..a032fd6256daec3a2b89ade46bc2b05f2a12fbf3 --- /dev/null +++ b/openai_moe/impls/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /openai_moe/impls + + + +
+ ← back +
+

Index of /openai_moe/impls

+ + + \ No newline at end of file diff --git a/openai_moe/index.html b/openai_moe/index.html new file mode 100644 index 0000000000000000000000000000000000000000..aa8352a8b25e9434df4f5e5d95c60283730bb0ee --- /dev/null +++ b/openai_moe/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /openai_moe + + + +
+ ← back +
+

Index of /openai_moe

+ + + \ No newline at end of file diff --git a/openai_moe/results/artifacts/combine/latency.svg b/openai_moe/results/artifacts/combine/latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..10dbc66aeb1ffe85716a2da3bc2a8a2ad4700bc3 --- /dev/null +++ b/openai_moe/results/artifacts/combine/latency.svg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b68c91c95cfb46a71083a3812949c831a6e82a5f655eb32ed7c0b19426124d +size 21857 diff --git a/openai_moe/results/cells/combine.py b/openai_moe/results/cells/combine.py new file mode 100644 index 0000000000000000000000000000000000000000..ca2ef03a01cd0a7f3bc24a5646d108265d436bf2 --- /dev/null +++ b/openai_moe/results/cells/combine.py @@ -0,0 +1,27 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "matplotlib", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +from kernels_benchmark_tools.core.visuals import generate_combined_results + +# Map display names to uvnote environment variables +cache_env_map = { + # "PyTorch OpenAI MoE": "UVNOTE_FILE_TORCH_OPENAI_MOE_BENCHMARK", + "Binned PyTorch": "UVNOTE_FILE_BINNED_TORCH_BENCHMARK", + "GptOssExperts": "UVNOTE_FILE_GPT_OSS_MOE_BENCHMARK", +} + +# Generate combined results with visualization +generate_combined_results( + cache_env_map=cache_env_map, + output_filename="openai_moe.jsonl", + svg_filename="latency.svg" +) \ No newline at end of file diff --git a/openai_moe/results/combined_results.html b/openai_moe/results/combined_results.html new file mode 100644 index 0000000000000000000000000000000000000000..bedcc83cf8db7d27f0e74cfbb9d1c9ceb5663901 --- /dev/null +++ b/openai_moe/results/combined_results.html @@ -0,0 +1,4935 @@ + + + + + + OpenAI-style MoE Benchmark - Combined Results + + + + + + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

OpenAI-style MoE (Mixture of Experts) Benchmarks - Aggregated Results

+

This document combines benchmark results from multiple OpenAI-style MoE implementations.

+

Combined Summary and Visualization

+
+ + + + + + + 2025-10-31T20:14:14.575906 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B1_S512_E2 + + + + + + + + + + + + + cuda_B1_S512_E4 + + + + + + + + + + + + + cuda_B1_S1024_E2 + + + + + + + + + + + + + cuda_B1_S1024_E4 + + + + + + + + + + + + + cuda_B4_S512_E2 + + + + + + + + + + + + + cuda_B4_S512_E4 + + + + + + + + + + + + + cuda_B4_S1024_E2 + + + + + + + + + + + + + cuda_B4_S1024_E4 + + + + Workload + + + + + + + + + + + + + + + + + 0 + + + + + + + + + + + + + 200 + + + + + + + + + + + + + 400 + + + + + + + + + + + + + 600 + + + + + + + + + + + + + 800 + + + + + + + + + + + + + 1000 + + + + + + + + + + + + + 1200 + + + + + + + + + + + + + 1400 + + + + + + + + + + + + + 1600 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + binned_torch + + + + + + + + + gpt_oss_experts + + + + + + + + + + +
+ +
+
+ +▶ code +▼ output + ▶ uv-logs + | +Cell: combine | 4.26s + | + +Raw +
+ +
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ Binned PyTorch                : /__w/kernels-benchmarks/kernels-benchmarks/benches/openai_moe/impls/.uvnote/cache/fd01907ce582015b5dd52e56081cc8e2a21813f73271b422308d60a8ab9391af
+✓ GptOssExperts                 : /__w/kernels-benchmarks/kernels-benchmarks/benches/openai_moe/impls/.uvnote/cache/002e3e7d42f2dbf6d5e5216db57e56aa649bc6ac59ce4131ce80c5849e52482b
+
+  ✓ Found Binned PyTorch
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/openai_moe/impls/.uvnote/cache/fd01907ce582015b5dd52e56081cc8e2a21813f73271b422308d60a8ab9391af/openai_moe.jsonl
+  ✓ Found GptOssExperts
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/openai_moe/impls/.uvnote/cache/002e3e7d42f2dbf6d5e5216db57e56aa649bc6ac59ce4131ce80c5849e52482b/openai_moe.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+binned_torch             cuda_B1_S1024_E2     372.79  True
+binned_torch             cuda_B1_S1024_E4     382.68  True
+binned_torch             cuda_B1_S512_E2      150.05  True
+binned_torch             cuda_B1_S512_E4      200.26  True
+binned_torch             cuda_B4_S1024_E2    1486.48  True
+binned_torch             cuda_B4_S1024_E4    1524.50  True
+binned_torch             cuda_B4_S512_E2      742.02  True
+binned_torch             cuda_B4_S512_E4      801.90  True
+gpt_oss_experts          cuda_B1_S1024_E2       3.79  True
+gpt_oss_experts          cuda_B1_S1024_E4       5.24  True
+gpt_oss_experts          cuda_B1_S512_E2        2.63  True
+gpt_oss_experts          cuda_B1_S512_E4        3.89  True
+gpt_oss_experts          cuda_B4_S1024_E2      13.28  True
+gpt_oss_experts          cuda_B4_S1024_E4      13.19  True
+gpt_oss_experts          cuda_B4_S512_E2        6.74  True
+gpt_oss_experts          cuda_B4_S512_E4        7.36  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 16 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+  ✓ Binned PyTorch
+  ✓ GptOssExperts
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+latency.svg +
+ + + + + + + 2025-10-31T20:14:14.575906 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B1_S512_E2 + + + + + + + + + + + + + cuda_B1_S512_E4 + + + + + + + + + + + + + cuda_B1_S1024_E2 + + + + + + + + + + + + + cuda_B1_S1024_E4 + + + + + + + + + + + + + cuda_B4_S512_E2 + + + + + + + + + + + + + cuda_B4_S512_E4 + + + + + + + + + + + + + cuda_B4_S1024_E2 + + + + + + + + + + + + + cuda_B4_S1024_E4 + + + + Workload + + + + + + + + + + + + + + + + + 0 + + + + + + + + + + + + + 200 + + + + + + + + + + + + + 400 + + + + + + + + + + + + + 600 + + + + + + + + + + + + + 800 + + + + + + + + + + + + + 1000 + + + + + + + + + + + + + 1200 + + + + + + + + + + + + + 1400 + + + + + + + + + + + + + 1600 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + binned_torch + + + + + + + + + gpt_oss_experts + + + + + + + + + + +
+
+
+
+
+ + + \ No newline at end of file diff --git a/openai_moe/results/index.html b/openai_moe/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..f90c14f2561df54bf5684b44db4e9111a3233a73 --- /dev/null +++ b/openai_moe/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /openai_moe/results + + + +
+ ← back +
+

Index of /openai_moe/results

+ + + \ No newline at end of file diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl index 1c45cd7ef0b4126f1b9f9093527991af5f22312d..2f046365b897b6b0052a6d0c4d2d39bda02f57ee 100644 --- a/rotary/impls/artifacts/benchmark/rotary.jsonl +++ b/rotary/impls/artifacts/benchmark/rotary.jsonl @@ -1,24 +1,24 @@ -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17100299999128765, "p50": 0.1746739999930469, "p90": 0.1748229999520845, "mean": 0.17718919998515048, "iqr": 0.0008499999921696144, "raw_times": [0.1739729999599149, 0.19147300002941847, 0.17100299999128765, 0.1748229999520845, 0.1746739999930469], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1834729999927731, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2229240000133359, "p50": 0.22586399995816464, "p90": 
0.2263739999648351, "mean": 0.2256739999893398, "iqr": 0.0011099999710495467, "raw_times": [0.2229240000133359, 0.22586399995816464, 0.2263739999648351, 0.22526399999378555, 0.22794400001657777], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22923400001673144, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22077400001307979, "p50": 0.2269739999860576, "p90": 0.2270040000098561, "mean": 0.2274739999961639, "iqr": 0.0016900000332498166, "raw_times": [0.22531399997660628, 0.2269739999860576, 0.23730399999521978, 0.22077400001307979, 0.2270040000098561], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23222400000122434, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": 
"3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21741400001928923, "p50": 0.2200139999786188, "p90": 0.22255300001461364, "mean": 0.22600780001766907, "iqr": 0.0031789999752618314, "raw_times": [0.2200139999786188, 0.21741400001928923, 0.22255300001461364, 0.2193740000393518, 0.25068400003647184], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22356400000944632, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22311399999352943, "p50": 0.22467400003733928, "p90": 0.22555399999646397, "mean": 0.22520960002339052, "iqr": 0.0019609999526437605, "raw_times": [0.22555399999646397, 0.22467400003733928, 0.2291130000457997, 0.22311399999352943, 0.2235930000438202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042399999440022, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, 
"rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21832400000221241, "p50": 0.21977400001560454, "p90": 0.22210299999869676, "mean": 0.22257580001223687, "iqr": 0.002768999991076271, "raw_times": [0.21832400000221241, 0.22210299999869676, 0.23334400003705014, 0.2193340000076205, 0.21977400001560454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23037299996531146, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21834399996123466, "p50": 0.22027399995749875, "p90": 0.22137399997745888, "mean": 0.22062599997525467, "iqr": 0.00113999999484804, "raw_times": [0.22290399999747024, 0.22027399995749875, 0.22137399997745888, 0.22023399998261084, 0.21834399996123466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22569399999383677, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", 
"tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22165399997220447, "p50": 0.22276400000009744, "p90": 0.2233839999803422, "mean": 0.22286399999984496, "iqr": 0.0009499999578110874, "raw_times": [0.2224340000225311, 0.22165399997220447, 0.2233839999803422, 0.22276400000009744, 0.2240840000240496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2297839999982898, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22094399997740766, "p50": 0.22392400001081114, "p90": 0.226194999981999, "mean": 0.22451620000083494, "iqr": 0.003770999967400712, "raw_times": [0.22094399997740766, 0.22392400001081114, 0.22909400001935865, 0.226194999981999, 0.22242400001459828], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22784399999409288, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, 
"mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22157400002242866, "p50": 0.22411399999100468, "p90": 0.22636400001374568, "mean": 0.22692980001011165, "iqr": 0.0029010000162088545, "raw_times": [0.22346299999753683, 0.22636400001374568, 0.2391340000258424, 0.22157400002242866, 0.22411399999100468], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22614400000975365, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22298400000408947, "p50": 0.2238440000041919, "p90": 0.225143999955435, "mean": 0.22477019999769254, "iqr": 0.0013799999578623101, "raw_times": [0.22376399999757268, 0.22298400000408947, 0.22811500002717366, 0.225143999955435, 0.2238440000041919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22483399999373432, "peak_bytes": 109314048, 
"ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22621400000844005, "p50": 0.22858399995584477, "p90": 0.22970399999167057, "mean": 0.22850999999945998, "iqr": 0.003459999959432025, "raw_times": [0.22624400003223855, 0.22858399995584477, 0.22621400000844005, 0.23180400000910595, 0.22970399999167057], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22907400000349298, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22101400003293747, "p50": 0.224504000016168, "p90": 0.22463400000560796, "mean": 0.22994020000624005, "iqr": 0.0010800000040944724, "raw_times": [0.22101400003293747, 0.22463400000560796, 
0.2559949999749733, 0.224504000016168, 0.2235540000015135], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2261639999687759, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22005400001035014, "p50": 0.22374300004912584, "p90": 0.22502399997392786, "mean": 0.2236157999959687, "iqr": 0.0018200000226897828, "raw_times": [0.22374300004912584, 0.2260539999952016, 0.22320399995123807, 0.22502399997392786, 0.22005400001035014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2302039999904082, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22128399996290682, "p50": 
0.22392400001081114, "p90": 0.22476399999504793, "mean": 0.22437599998283986, "iqr": 0.0009500000146545062, "raw_times": [0.22381399998039342, 0.22809399996503998, 0.22476399999504793, 0.22392400001081114, 0.22128399996290682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23023399995736327, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21981399999049245, "p50": 0.22491400000035355, "p90": 0.2271139999834304, "mean": 0.2256657999851086, "iqr": 0.0057999999967250915, "raw_times": [0.2213139999867053, 0.2351729999645613, 0.21981399999049245, 0.22491400000035355, 0.2271139999834304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22896399997307526, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA 
L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2212340000369295, "p50": 0.22300300003053053, "p90": 0.22370400000681911, "mean": 0.2228398000170273, "iqr": 0.0018700000055105193, "raw_times": [0.22300300003053053, 0.22370400000681911, 0.22442400000954876, 0.2212340000369295, 0.2218340000013086], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24375499998541272, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21925400000100126, "p50": 0.22213400001191985, "p90": 0.2221839999947406, "mean": 0.22427599999446102, "iqr": 0.0003700000092976552, "raw_times": [0.22181399998544293, 0.23599399997920045, 0.21925400000100126, 0.22213400001191985, 0.2221839999947406], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.255094999999983, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, 
"head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22207399996432287, "p50": 0.22611399998595516, "p90": 0.22614400000975365, "mean": 0.22540399997978966, "iqr": 0.0004800000397153781, "raw_times": [0.22611399998595516, 0.22207399996432287, 0.22702399996887834, 0.22614400000975365, 0.22566399997003828], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24224399999184243, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21955500000103711, "p50": 0.2238039999724606, "p90": 0.22417399998175824, "mean": 0.229038399993442, "iqr": 0.0011799999697359453, "raw_times": [0.2238039999724606, 0.22417399998175824, 0.25466499999993175, 0.2229940000120223, 0.21955500000103711], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23440400002527895, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", 
"impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2215840000303615, "p50": 0.22219400000267342, "p90": 0.22536399995942702, "mean": 0.22351999999727923, "iqr": 0.0037099999303791265, "raw_times": [0.2215840000303615, 0.22219400000267342, 0.22680399996488632, 0.22536399995942702, 0.2216540000290479], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22727300000724426, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22125399999595174, "p50": 0.22540399999115834, "p90": 0.23793399998339737, "mean": 0.23205199998983517, "iqr": 0.012690000005477486, "raw_times": [0.2504240000007485, 0.22524399997791988, 0.22125399999595174, 0.23793399998339737, 0.22540399999115834], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22909400001935865, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, 
"mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2288640000074338, "p50": 0.2294729999903211, "p90": 0.22959400001809627, "mean": 0.2293698000016775, "iqr": 0.0005100000066704524, "raw_times": [0.22959400001809627, 0.2294729999903211, 0.22983399998111054, 0.2288640000074338, 0.22908400001142581], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22990399997979694, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} -{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347010000240516, "p50": 0.6367309999859572, "p90": 0.6407210000247687, "mean": 0.6405370000038602, "iqr": 0.004560000036235579, "raw_times": [0.6543709999959901, 0.6347010000240516, 0.6407210000247687, 0.6367309999859572, 0.6361609999885331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6394609999915701, 
"peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0773009999761598, "p50": 0.07878200000277502, "p90": 0.07927199999357981, "mean": 0.08125379999910365, "iqr": 0.0008899999670575198, "raw_times": [0.07927199999357981, 0.07878200000277502, 0.09253199999648132, 0.07838200002652229, 0.0773009999761598], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08711200001698671, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09208300002683245, "p50": 0.09279300002162927, "p90": 
0.09387199997945572, "mean": 0.09325840001110919, "iqr": 0.0014699999724143709, "raw_times": [0.09208300002683245, 0.09240200000704135, 0.09387199997945572, 0.09514200002058715, 0.09279300002162927], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0956929999915701, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09122299996988659, "p50": 0.09174199999506527, "p90": 0.09311200000183817, "mean": 0.09907239998483419, "iqr": 0.0014700000292577897, "raw_times": [0.12764299998480055, 0.09311200000183817, 0.09174199999506527, 0.09164199997258038, 0.09122299996988659], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09512200000472149, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": 
"cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09014300002263553, "p50": 0.09057199997641874, "p90": 0.09099299995796173, "mean": 0.09084659998279676, "iqr": 0.0004309999894758221, "raw_times": [0.09014300002263553, 0.0905619999684859, 0.09099299995796173, 0.09057199997641874, 0.0919629999884819], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09270300000707721, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09128200002805897, "p50": 0.09358200003362072, "p90": 0.09361200000057579, "mean": 0.0932360000092558, "iqr": 5.9999990753567545e-05, "raw_times": [0.09358200003362072, 0.09415199997420132, 0.09361200000057579, 0.09128200002805897, 0.09355200000982222], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09915200001842095, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 
9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09190200000830373, "p50": 0.09338199998865093, "p90": 0.09447299999010283, "mean": 0.09361019999687414, "iqr": 0.0011509999922054703, "raw_times": [0.09190200000830373, 0.09338199998865093, 0.09497199999941586, 0.09332199999789736, 0.09447299999010283], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09578299994927875, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09281299998065151, "p50": 0.09429199997157411, "p90": 
0.09554199999683988, "mean": 0.0945923999893239, "iqr": 0.0018490000002202578, "raw_times": [0.09662200000093435, 0.09281299998065151, 0.09369299999661962, 0.09554199999683988, 0.09429199997157411], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09821199995485586, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09230199998455646, "p50": 0.09352199998602373, "p90": 0.09397200000194061, "mean": 0.09366439998075293, "iqr": 0.00047900005029077874, "raw_times": [0.09230199998455646, 0.09503299997959402, 0.09349299995164984, 0.09352199998602373, 0.09397200000194061], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09802200003150574, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": 
"cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0920319999977437, "p50": 0.0931920000084574, "p90": 0.09354200000188939, "mean": 0.09315399998968132, "iqr": 0.0011200000358257967, "raw_times": [0.0931920000084574, 0.0920319999977437, 0.0924219999660636, 0.09458199997425254, 0.09354200000188939], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09710300003007433, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09380299997019392, "p50": 0.0960819999704654, "p90": 0.10296200002812839, "mean": 0.0988743999982944, "iqr": 0.00756899999032612, "raw_times": [0.10613199998488199, 0.09539300003780227, 0.09380299997019392, 0.10296200002812839, 0.0960819999704654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0969220000115456, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 
9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09422199997288772, "p50": 0.0958319999995183, "p90": 0.09810200003812497, "mean": 0.09699820000150794, "iqr": 0.0028600000518963498, "raw_times": [0.0958319999995183, 0.09524199998622862, 0.09422199997288772, 0.09810200003812497, 0.10159300001078009], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09825199998658718, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.26106699999672855, "p50": 0.2625369999691429, "p90": 
0.266995999993469, "mean": 0.2640226000039547, "iqr": 0.0046789999714746955, "raw_times": [0.2625369999691429, 0.2671960000384388, 0.2623170000219943, 0.266995999993469, 0.26106699999672855], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26397600004202104, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09289299998727074, "p50": 0.09412200000724624, "p90": 0.0941720000469104, "mean": 0.09422220001624737, "iqr": 0.0009999999974752427, "raw_times": [0.09412200000724624, 0.09317200004943516, 0.0941720000469104, 0.09289299998727074, 0.09675199999037432], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09680300001946307, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", 
"batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09332299998732196, "p50": 0.09457300001258773, "p90": 0.10429200000316996, "mean": 0.09875060001149905, "iqr": 0.009959999999864522, "raw_times": [0.10723300005111014, 0.09332299998732196, 0.09457300001258773, 0.10429200000316996, 0.09433200000330544], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.0961519999691518, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0932629999965684, "p50": 0.09406200001649268, "p90": 0.09426200000461904, "mean": 0.09393640000325831, "iqr": 0.0008400000410802022, "raw_times": [0.0932629999965684, 0.09406200001649268, 0.09342199996353884, 0.09467300003507262, 0.09426200000461904], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09624299997312846, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 
9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09137300003203563, "p50": 0.09416199998213415, "p90": 0.09422200002973113, "mean": 0.09578819999660482, "iqr": 0.00042000004896181053, "raw_times": [0.09422200002973113, 0.09137300003203563, 0.09416199998213415, 0.10538199995835384, 0.09380199998076932], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09458300002052056, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09282199999915974, "p50": 0.09416199998213415, "p90": 0.09431199998743978, "mean": 
0.09398199999850476, "iqr": 0.00039999997625272954, "raw_times": [0.09416199998213415, 0.0947020000126031, 0.09431199998743978, 0.09282199999915974, 0.09391200001118705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09807300000375108, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09392299995170106, "p50": 0.09451299996499074, "p90": 0.09455299999672206, "mean": 0.09461079997663546, "iqr": 0.00017000002117129043, "raw_times": [0.09392299995170106, 0.09568199999421267, 0.09451299996499074, 0.09438299997555077, 0.09455299999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09652299996787406, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, 
"seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0935829999662019, "p50": 0.0949919999584381, "p90": 0.09520300000076531, "mean": 0.09494659997244526, "iqr": 0.0008610000463704637, "raw_times": [0.09434199995439485, 0.09661299998242612, 0.0935829999662019, 0.09520300000076531, 0.0949919999584381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09693200001947844, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09247200000572775, "p50": 0.09415199997420132, "p90": 0.09440299999141644, "mean": 0.09443839999221382, "iqr": 0.001340999972399004, "raw_times": [0.09440299999141644, 0.09415199997420132, 0.09810299997070615, 0.09306200001901743, 0.09247200000572775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09771300000238625, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 
9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09252199998854849, "p50": 0.093122000009771, "p90": 0.09490200000072946, "mean": 0.09375020000561562, "iqr": 0.0023690000148235413, "raw_times": [0.093122000009771, 0.09252199998854849, 0.09490200000072946, 0.09567200004312326, 0.09253299998590592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09691200000361277, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09136299996725938, "p50": 0.09425199999668621, "p90": 0.0960129999612036, "mean": 0.0991567999903964, 
"iqr": 0.002309999956651154, "raw_times": [0.09370300000455245, 0.09136299996725938, 0.12045300002228032, 0.09425199999668621, 0.0960129999612036], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09500200002321435, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.26061699998081167, "p50": 0.26556599999594255, "p90": 0.26563699998405355, "mean": 0.2649027999950704, "iqr": 0.001249999968422344, "raw_times": [0.26061699998081167, 0.26830699999891294, 0.2643870000156312, 0.26563699998405355, 0.26556599999594255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26123600002847525, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, 
"num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8460020000029544, "p50": 0.8488419999821417, "p90": 0.8517510000274342, "mean": 0.8514335999961986, "iqr": 0.004409000041505351, "raw_times": [0.8632309999825338, 0.8488419999821417, 0.8517510000274342, 0.8473419999859289, 0.8460020000029544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8540019999827564, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null} diff --git a/rotary/impls/cells/benchmark.py b/rotary/impls/cells/benchmark.py index 94d42ad7f4a476fdf06a84f3b75776b234ecb848..7f6fcb6c184c6611acf24218eb91d13889eaa08e 100644 --- a/rotary/impls/cells/benchmark.py +++ b/rotary/impls/cells/benchmark.py @@ -4,6 +4,7 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", +# "kernels", # ] # # [tool.uv.sources] @@ -12,46 +13,36 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel +# Load the rotary kernel +rotary = get_kernel("kernels-community/rotary") -def apply_rotary_torch(x1, x2, cos, sin, conj=False): - """Reference rotary implementation.""" - if not conj: - out1 = x1 * cos - x2 * sin - out2 = x1 * sin + x2 * cos - else: - out1 = x1 * cos + x2 * sin - out2 = -x1 * sin + x2 * cos - return out1, out2 - -def torch_rotary(query, key, cos, sin, conj=False): +def hf_kernels_rotary(query, key, cos, sin, conj=False): rotary_dim = cos.shape[-1] - # Clone inputs to avoid modifying them + # Clone to avoid 
modifying inputs q_out = query.clone() k_out = key.clone() # Apply rotation to query q1 = q_out[..., :rotary_dim] q2 = q_out[..., rotary_dim : 2 * rotary_dim] - q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj) - q_out[..., :rotary_dim] = q_out_1 - q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2 + rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj) # Apply rotation to key k1 = k_out[..., :rotary_dim] k2 = k_out[..., rotary_dim : 2 * rotary_dim] - k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj) - k_out[..., :rotary_dim] = k_out_1 - k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2 + rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj) return q_out, k_out run_benchmark( kernel_type=KernelTypeEnum.ROTARY, - impl_name="torch_eager", - impl_tags={"family": "pytorch", "backend": "eager"}, - impl_func=torch_rotary, + impl_name="hf_kernels_rotary", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_rotary, + dtype="float32", ) \ No newline at end of file diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html index 727c47ed5f26fae785282a5942ab8c0a053fcc96..330944cc91943bb2b53930714f4af06cb0ca72b7 100644 --- a/rotary/impls/hf_kernels_rotary.html +++ b/rotary/impls/hf_kernels_rotary.html @@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.23s | Raw -GitHub +GitHub +🤗 HF
@@ -4122,7 +4123,7 @@ Cell: nv | 0.21s
-
Thu Oct 30 15:52:23 2025       
+
Fri Oct 31 20:00:00 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   32C    P0            101W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,11 +4156,12 @@ Cell: nv | 0.21s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 8.39s
+Cell: benchmark | 4.67s
  | 
 
 Raw
-GitHub
+GitHub
+🤗 HF
 
@@ -4225,23 +4227,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 437.951us 1890.33% 437.951us 437.951us 1 - hf_kernels_rotary 12.22% 256.435us 99.67% 2.092ms 2.092ms 0.000us 0.00% 24.448us 24.448us 1 - _rotary_dba7d1e::apply_rotary 2.70% 56.773us 5.22% 109.533us 18.255us 16.128us 69.61% 16.128us 2.688us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.128us 69.61% 16.128us 2.688us 6 - aten::clone 2.06% 43.312us 79.20% 1.663ms 277.110us 0.000us 0.00% 8.320us 1.387us 6 - aten::copy_ 2.16% 45.349us 74.16% 1.557ms 259.469us 7.040us 30.39% 8.320us 1.387us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.040us 30.39% 7.040us 1.173us 6 - Activity Buffer Request 68.35% 1.435ms 68.35% 1.435ms 1.435ms 1.280us 5.52% 1.280us 1.280us 1 - aten::empty_strided 2.98% 62.532us 2.98% 62.532us 10.422us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.65% 76.672us 3.65% 76.672us 12.779us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.33% 48.990us 3.04% 63.719us 5.310us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.70% 14.729us 0.70% 14.729us 1.227us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.51% 52.760us 2.51% 52.760us 8.793us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.33% 6.840us 0.33% 6.840us 6.840us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 426.303us 1837.51% 426.303us 
426.303us 1 + hf_kernels_rotary 12.40% 260.056us 99.66% 2.090ms 2.090ms 0.000us 0.00% 24.480us 24.480us 1 + _rotary_dba7d1e::apply_rotary 2.75% 57.674us 5.07% 106.315us 17.719us 16.128us 69.52% 16.128us 2.688us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.128us 69.52% 16.128us 2.688us 6 + aten::clone 2.13% 44.582us 79.34% 1.664ms 277.309us 0.000us 0.00% 8.352us 1.392us 6 + aten::copy_ 1.84% 38.562us 74.44% 1.561ms 260.165us 7.072us 30.48% 8.352us 1.392us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.072us 30.48% 7.072us 1.179us 6 + Activity Buffer Request 69.01% 1.447ms 69.01% 1.447ms 1.447ms 1.280us 5.52% 1.280us 1.280us 1 + aten::empty_strided 2.78% 58.281us 2.78% 58.281us 9.713us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 3.58% 75.121us 3.58% 75.121us 12.520us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.14% 44.780us 2.85% 59.790us 4.983us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.72% 15.010us 0.72% 15.010us 1.251us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.32% 48.641us 2.32% 48.641us 8.107us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.34% 7.100us 0.34% 7.100us 7.100us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.099ms -Self CUDA time total: 23.168us +Self CPU time total: 2.097ms +Self CUDA time total: 23.200us @@ -4251,23 +4253,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls 
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.903us 1449.48% 347.903us 347.903us 1 - hf_kernels_rotary 8.54% 161.773us 99.74% 1.890ms 1.890ms 0.000us 0.00% 25.314us 25.314us 1 - _rotary_dba7d1e::apply_rotary 2.18% 41.260us 4.61% 87.431us 14.572us 16.194us 67.47% 16.194us 2.699us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.194us 67.47% 16.194us 2.699us 6 - aten::clone 1.21% 22.941us 84.30% 1.597ms 266.206us 0.000us 0.00% 9.120us 1.520us 6 - aten::copy_ 2.05% 38.809us 81.33% 1.541ms 256.844us 7.808us 32.53% 9.120us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 32.53% 7.808us 1.301us 6 - Activity Buffer Request 76.43% 1.448ms 76.43% 1.448ms 1.448ms 1.312us 5.47% 1.312us 1.312us 1 - aten::empty_strided 1.75% 33.230us 1.75% 33.230us 5.538us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.85% 54.092us 2.85% 54.092us 9.015us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.79% 33.972us 2.29% 43.382us 3.615us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.50% 9.410us 0.50% 9.410us 0.784us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.44% 46.171us 2.44% 46.171us 7.695us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.26% 4.990us 0.26% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 340.796us 1422.00% 340.796us 340.796us 1 + hf_kernels_rotary 9.48% 182.026us 99.73% 1.916ms 1.916ms 0.000us 0.00% 25.278us 25.278us 1 + _rotary_dba7d1e::apply_rotary 2.22% 42.701us 4.40% 84.531us 14.088us 16.159us 67.42% 16.159us 2.693us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 16.159us 67.42% 16.159us 2.693us 6 + aten::clone 1.41% 27.120us 83.58% 1.605ms 267.570us 0.000us 0.00% 9.119us 1.520us 6 + aten::copy_ 2.02% 38.773us 80.45% 1.545ms 257.555us 7.807us 32.58% 9.119us 1.520us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 32.58% 7.807us 1.301us 6 + Activity Buffer Request 75.56% 1.451ms 75.56% 1.451ms 1.451ms 1.312us 5.47% 1.312us 1.312us 1 + aten::empty_strided 1.72% 32.970us 1.72% 32.970us 5.495us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.88% 55.291us 2.88% 55.291us 9.215us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.76% 33.749us 2.27% 43.642us 3.637us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.52% 9.893us 0.52% 9.893us 0.824us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.18% 41.830us 2.18% 41.830us 6.972us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.161us 0.27% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.895ms -Self CUDA time total: 24.002us +Self CPU time total: 1.921ms +Self CUDA time total: 23.966us @@ -4277,23 +4279,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.799us 1421.56% 344.799us 344.799us 1 - hf_kernels_rotary 8.36% 157.652us 99.72% 
1.880ms 1.880ms 0.000us 0.00% 25.535us 25.535us 1 - _rotary_dba7d1e::apply_rotary 2.20% 41.393us 4.58% 86.433us 14.405us 16.479us 67.94% 16.479us 2.747us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.479us 67.94% 16.479us 2.747us 6 - aten::clone 1.19% 22.449us 84.54% 1.594ms 265.688us 0.000us 0.00% 9.056us 1.509us 6 - aten::copy_ 1.98% 37.391us 81.51% 1.537ms 256.168us 7.776us 32.06% 9.056us 1.509us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 32.06% 7.776us 1.296us 6 - Activity Buffer Request 76.55% 1.443ms 76.55% 1.443ms 1.443ms 1.280us 5.28% 1.280us 1.280us 1 - aten::empty_strided 1.84% 34.673us 1.84% 34.673us 5.779us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.98% 56.200us 2.98% 56.200us 9.367us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.75% 32.991us 2.23% 42.120us 3.510us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.129us 0.48% 9.129us 0.761us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.39% 45.040us 2.39% 45.040us 7.507us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.28% 5.250us 0.28% 5.250us 5.250us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.421us 1391.81% 339.421us 339.421us 1 + hf_kernels_rotary 9.18% 172.926us 99.76% 1.879ms 1.879ms 0.000us 0.00% 25.699us 25.699us 1 + _rotary_dba7d1e::apply_rotary 2.20% 41.409us 4.51% 85.000us 14.167us 16.481us 67.58% 16.481us 2.747us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 16.481us 67.58% 16.481us 2.747us 6 + aten::clone 1.46% 27.581us 83.73% 1.577ms 262.862us 0.000us 0.00% 9.218us 1.536us 6 + aten::copy_ 1.97% 37.091us 80.45% 1.515ms 252.563us 7.906us 32.42% 9.218us 1.536us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.906us 32.42% 7.906us 1.318us 6 + Activity Buffer Request 75.71% 1.426ms 75.71% 1.426ms 1.426ms 1.312us 5.38% 1.312us 1.312us 1 + aten::empty_strided 1.82% 34.210us 1.82% 34.210us 5.702us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.77% 52.231us 2.77% 52.231us 8.705us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.80% 33.892us 2.33% 43.952us 3.663us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.53% 10.060us 0.53% 10.060us 0.838us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.31% 43.591us 2.31% 43.591us 7.265us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 4.550us 0.24% 4.550us 4.550us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.886ms -Self CUDA time total: 24.255us +Self CPU time total: 1.884ms +Self CUDA time total: 24.387us @@ -4303,23 +4305,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.221us 1225.16% 344.221us 344.221us 1 - hf_kernels_rotary 7.87% 162.633us 
99.75% 2.060ms 2.060ms 0.000us 0.00% 29.824us 29.824us 1 - _rotary_dba7d1e::apply_rotary 1.96% 40.432us 4.15% 85.752us 14.292us 17.728us 63.10% 17.728us 2.955us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.728us 63.10% 17.728us 2.955us 6 - aten::clone 1.05% 21.772us 85.59% 1.768ms 294.674us 0.000us 0.00% 12.096us 2.016us 6 - aten::copy_ 1.75% 36.131us 82.94% 1.713ms 285.533us 10.368us 36.90% 12.096us 2.016us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.368us 36.90% 10.368us 1.728us 6 - Activity Buffer Request 69.12% 1.428ms 69.12% 1.428ms 1.428ms 1.728us 6.15% 1.728us 1.728us 1 - aten::empty_strided 1.60% 33.071us 1.60% 33.071us 5.512us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 12.07% 249.233us 12.07% 249.233us 41.539us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.63% 33.600us 2.13% 43.960us 3.663us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.50% 10.360us 0.50% 10.360us 0.863us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.19% 45.320us 2.19% 45.320us 7.553us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.25% 5.220us 0.25% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.466us 1252.36% 353.466us 353.466us 1 + hf_kernels_rotary 8.35% 176.747us 99.76% 2.111ms 2.111ms 0.000us 0.00% 30.048us 30.048us 1 + _rotary_dba7d1e::apply_rotary 2.17% 45.850us 4.21% 89.000us 14.833us 17.664us 62.59% 17.664us 2.944us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 17.664us 62.59% 17.664us 2.944us 6 + aten::clone 1.36% 28.714us 85.13% 1.802ms 300.274us 0.000us 0.00% 12.384us 2.064us 6 + aten::copy_ 1.83% 38.751us 82.20% 1.740ms 289.944us 10.560us 37.41% 12.384us 2.064us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 37.41% 10.560us 1.760us 6 + Activity Buffer Request 67.60% 1.431ms 67.60% 1.431ms 1.431ms 1.824us 6.46% 1.824us 1.824us 1 + aten::empty_strided 1.57% 33.269us 1.57% 33.269us 5.545us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 12.77% 270.306us 12.77% 270.306us 45.051us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.59% 33.568us 2.07% 43.911us 3.659us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.49% 10.343us 0.49% 10.343us 0.862us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.04% 43.150us 2.04% 43.150us 7.192us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 5.130us 0.24% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.066ms -Self CUDA time total: 28.096us +Self CPU time total: 2.116ms +Self CUDA time total: 28.224us @@ -4329,23 +4331,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.758us 1419.83% 345.758us 345.758us 1 - hf_kernels_rotary 7.72% 
159.843us 99.76% 2.064ms 2.064ms 0.000us 0.00% 25.664us 25.664us 1 - _rotary_dba7d1e::apply_rotary 1.98% 40.892us 4.09% 84.633us 14.106us 16.544us 67.94% 16.544us 2.757us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.544us 67.94% 16.544us 2.757us 6 - aten::clone 1.14% 23.531us 85.80% 1.775ms 295.882us 0.000us 0.00% 9.120us 1.520us 6 - aten::copy_ 1.76% 36.431us 83.03% 1.718ms 286.337us 7.808us 32.06% 9.120us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 32.06% 7.808us 1.301us 6 - Activity Buffer Request 69.77% 1.444ms 69.77% 1.444ms 1.444ms 1.312us 5.39% 1.312us 1.312us 1 - aten::empty_strided 1.63% 33.740us 1.63% 33.740us 5.623us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.50% 237.923us 11.50% 237.923us 39.654us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.68% 34.750us 2.15% 44.540us 3.712us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.47% 9.790us 0.47% 9.790us 0.816us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.11% 43.741us 2.11% 43.741us 7.290us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.890us 0.24% 4.890us 4.890us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 351.740us 1444.46% 351.740us 351.740us 1 + hf_kernels_rotary 8.68% 176.155us 99.77% 2.024ms 2.024ms 0.000us 0.00% 25.663us 25.663us 1 + _rotary_dba7d1e::apply_rotary 2.27% 46.099us 4.32% 87.680us 14.613us 16.479us 67.67% 16.479us 2.747us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 16.479us 67.67% 16.479us 2.747us 6 + aten::clone 1.42% 28.832us 84.62% 1.717ms 286.091us 0.000us 0.00% 9.184us 1.531us 6 + aten::copy_ 1.86% 37.831us 81.49% 1.653ms 275.519us 7.872us 32.33% 9.184us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 32.33% 7.872us 1.312us 6 + Activity Buffer Request 70.03% 1.420ms 70.03% 1.420ms 1.420ms 1.312us 5.39% 1.312us 1.312us 1 + aten::empty_strided 1.71% 34.601us 1.71% 34.601us 5.767us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.60% 194.784us 9.60% 194.784us 32.464us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.63% 33.102us 2.14% 43.512us 3.626us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.51% 10.410us 0.51% 10.410us 0.867us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.05% 41.581us 2.05% 41.581us 6.930us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.660us 0.23% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.069ms -Self CUDA time total: 24.352us +Self CPU time total: 2.029ms +Self CUDA time total: 24.351us @@ -4355,23 +4357,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 375.259us 1340.31% 375.259us 375.259us 1 - hf_kernels_rotary 7.92% 165.422us 
99.76% 2.085ms 2.085ms 0.000us 0.00% 29.790us 29.790us 1 - _rotary_dba7d1e::apply_rotary 2.01% 42.019us 4.24% 88.630us 14.772us 17.566us 62.74% 17.566us 2.928us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.566us 62.74% 17.566us 2.928us 6 - aten::clone 1.13% 23.560us 85.51% 1.787ms 297.810us 0.000us 0.00% 12.224us 2.037us 6 - aten::copy_ 1.86% 38.872us 82.84% 1.731ms 288.508us 10.432us 37.26% 12.224us 2.037us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 37.26% 10.432us 1.739us 6 - Activity Buffer Request 68.75% 1.437ms 68.75% 1.437ms 1.437ms 1.792us 6.40% 1.792us 1.792us 1 - aten::empty_strided 1.54% 32.252us 1.54% 32.252us 5.375us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 12.23% 255.474us 12.23% 255.474us 42.579us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.66% 34.672us 2.10% 43.902us 3.658us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.44% 9.230us 0.44% 9.230us 0.769us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.23% 46.611us 2.23% 46.611us 7.769us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.930us 0.24% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.111us 1238.38% 349.111us 349.111us 1 + hf_kernels_rotary 23.24% 192.013us 99.32% 820.571us 820.571us 0.000us 0.00% 30.015us 30.015us 1 + _rotary_dba7d1e::apply_rotary 5.42% 44.795us 10.63% 87.866us 14.644us 17.632us 62.54% 17.632us 2.939us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 17.632us 62.54% 17.632us 2.939us 6 + aten::clone 2.69% 22.223us 60.09% 496.442us 82.740us 0.000us 0.00% 12.383us 2.064us 6 + aten::copy_ 4.60% 38.000us 53.48% 441.890us 73.648us 10.559us 37.46% 12.383us 2.064us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.559us 37.46% 10.559us 1.760us 6 + Activity Buffer Request 26.48% 218.816us 26.48% 218.816us 218.816us 1.824us 6.47% 1.824us 1.824us 1 + aten::empty_strided 3.91% 32.329us 3.91% 32.329us 5.388us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.40% 185.074us 22.40% 185.074us 30.846us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.04% 33.410us 5.36% 44.250us 3.688us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.31% 10.840us 1.31% 10.840us 0.903us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.21% 43.071us 5.21% 43.071us 7.178us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.68% 5.641us 0.68% 5.641us 5.641us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.090ms -Self CUDA time total: 27.998us +Self CPU time total: 826.212us +Self CUDA time total: 28.191us @@ -4381,23 +4383,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.557us 858.83% 346.557us 346.557us 1 - hf_kernels_rotary 
7.80% 160.642us 99.76% 2.055ms 2.055ms 0.000us 0.00% 43.200us 43.200us 1 - _rotary_dba7d1e::apply_rotary 2.00% 41.122us 4.23% 87.123us 14.521us 23.424us 58.05% 23.424us 3.904us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 23.424us 58.05% 23.424us 3.904us 6 - aten::clone 1.11% 22.900us 85.69% 1.765ms 294.130us 0.000us 0.00% 19.776us 3.296us 6 - aten::copy_ 1.80% 37.091us 82.95% 1.708ms 284.737us 16.928us 41.95% 19.776us 3.296us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 41.95% 16.928us 2.821us 6 - Activity Buffer Request 70.02% 1.442ms 70.02% 1.442ms 1.442ms 2.848us 7.06% 2.848us 2.848us 1 - aten::empty_strided 1.62% 33.460us 1.62% 33.460us 5.577us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.13% 229.194us 11.13% 229.194us 38.199us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.60% 33.049us 2.04% 42.051us 3.504us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.44% 9.002us 0.44% 9.002us 0.750us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.23% 46.001us 2.23% 46.001us 7.667us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.950us 0.24% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.984us 852.93% 344.984us 344.984us 1 + hf_kernels_rotary 22.02% 168.975us 99.39% 762.759us 762.759us 0.000us 0.00% 43.263us 43.263us 1 + _rotary_dba7d1e::apply_rotary 5.75% 44.162us 11.18% 85.802us 14.300us 23.456us 57.99% 23.456us 3.909us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 23.456us 57.99% 23.456us 3.909us 6 + aten::clone 2.91% 22.350us 60.45% 463.932us 77.322us 0.000us 0.00% 19.807us 3.301us 6 + aten::copy_ 4.98% 38.249us 53.45% 410.170us 68.362us 16.991us 42.01% 19.807us 3.301us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.991us 42.01% 16.991us 2.832us 6 + Activity Buffer Request 24.55% 188.395us 24.55% 188.395us 188.395us 2.816us 6.96% 2.816us 2.816us 1 + aten::empty_strided 4.09% 31.412us 4.09% 31.412us 5.235us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.91% 183.526us 23.91% 183.526us 30.588us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.40% 33.790us 5.74% 44.050us 3.671us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.34% 10.260us 1.34% 10.260us 0.855us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.43% 41.640us 5.43% 41.640us 6.940us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.61% 4.661us 0.61% 4.661us 4.661us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.060ms -Self CUDA time total: 40.352us +Self CPU time total: 767.420us +Self CUDA time total: 40.447us @@ -4407,23 +4409,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.374us 446.91% 349.374us 349.374us 1 - hf_kernels_rotary 
8.00% 163.391us 99.76% 2.039ms 2.039ms 0.000us 0.00% 90.720us 90.720us 1 - aten::clone 1.09% 22.181us 85.39% 1.745ms 290.833us 0.000us 0.00% 52.224us 8.704us 6 - aten::copy_ 1.85% 37.761us 82.69% 1.690ms 281.650us 39.680us 50.76% 52.224us 8.704us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 39.680us 50.76% 39.680us 6.613us 6 - _rotary_dba7d1e::apply_rotary 2.10% 42.834us 4.25% 86.883us 14.481us 38.496us 49.24% 38.496us 6.416us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 38.496us 49.24% 38.496us 6.416us 6 - Activity Buffer Request 69.78% 1.426ms 69.78% 1.426ms 1.426ms 12.544us 16.05% 12.544us 12.544us 1 - aten::empty_strided 1.61% 32.920us 1.61% 32.920us 5.487us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.06% 226.094us 11.06% 226.094us 37.682us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.62% 33.171us 2.12% 43.331us 3.611us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.50% 10.160us 0.50% 10.160us 0.847us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.16% 44.049us 2.16% 44.049us 7.341us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.951us 0.24% 4.951us 4.951us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.453us 442.64% 347.453us 347.453us 1 + hf_kernels_rotary 20.37% 160.826us 99.39% 784.751us 784.751us 0.000us 0.00% 91.040us 91.040us 1 + aten::clone 2.83% 22.340us 62.44% 492.983us 82.164us 0.000us 0.00% 52.865us 8.811us 6 + aten::copy_ 4.65% 36.740us 55.30% 436.663us 72.777us 40.321us 51.37% 52.865us 8.811us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 40.321us 51.37% 40.321us 6.720us 6 + _rotary_dba7d1e::apply_rotary 5.74% 45.350us 11.00% 86.891us 14.482us 38.175us 48.63% 38.175us 6.362us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 38.175us 48.63% 38.175us 6.362us 6 + Activity Buffer Request 27.86% 219.946us 27.86% 219.946us 219.946us 12.544us 15.98% 12.544us 12.544us 1 + aten::empty_strided 4.30% 33.980us 4.30% 33.980us 5.663us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.79% 179.977us 22.79% 179.977us 29.996us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.35% 34.361us 5.58% 44.051us 3.671us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.23% 9.690us 1.23% 9.690us 0.808us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.26% 41.541us 5.26% 41.541us 6.924us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.61% 4.830us 0.61% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.044ms -Self CUDA time total: 78.176us +Self CPU time total: 789.581us +Self CUDA time total: 78.496us @@ -4433,23 +4435,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 333.879us 824.19% 333.879us 333.879us 1 - hf_kernels_rotary 18.73% 154.483us 99.41% 820.134us 820.134us 0.000us 0.00% 43.327us 43.327us 1 - _rotary_dba7d1e::apply_rotary 4.89% 40.361us 10.02% 82.702us 13.784us 23.422us 57.82% 23.422us 3.904us 6 -void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 23.422us 57.82% 23.422us 3.904us 6 - aten::clone 2.46% 20.259us 65.56% 540.868us 90.145us 0.000us 0.00% 19.905us 3.317us 6 - aten::copy_ 4.70% 38.811us 59.16% 488.099us 81.350us 17.088us 42.18% 19.905us 3.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 42.18% 17.088us 2.848us 6 - Activity Buffer Request 27.39% 225.944us 27.39% 225.944us 225.944us 2.817us 6.95% 2.817us 2.817us 1 - aten::empty_strided 3.94% 32.510us 3.94% 32.510us 5.418us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 27.07% 223.344us 27.07% 223.344us 37.224us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.93% 32.394us 5.10% 42.081us 3.507us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.17% 9.687us 1.17% 9.687us 0.807us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.13% 42.341us 5.13% 42.341us 7.057us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.59% 4.860us 0.59% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.324us 858.06% 347.324us 347.324us 1 + hf_kernels_rotary 8.65% 173.958us 99.77% 2.007ms 2.007ms 0.000us 0.00% 43.325us 43.325us 1 + _rotary_dba7d1e::apply_rotary 2.18% 43.910us 4.21% 84.770us 14.128us 23.423us 57.87% 23.423us 3.904us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 23.423us 57.87% 23.423us 3.904us 6 + aten::clone 1.35% 27.211us 84.83% 1.706ms 284.405us 0.000us 0.00% 19.902us 3.317us 6 + aten::copy_ 1.92% 38.681us 81.76% 1.645ms 274.138us 17.055us 42.13% 19.902us 3.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.055us 42.13% 17.055us 2.842us 6 + Activity Buffer Request 70.68% 1.422ms 70.68% 1.422ms 1.422ms 2.847us 7.03% 2.847us 2.847us 1 + aten::empty_strided 1.71% 34.392us 1.71% 34.392us 5.732us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.16% 184.363us 9.16% 184.363us 30.727us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.62% 32.593us 2.08% 41.861us 3.488us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.46% 9.268us 0.46% 9.268us 0.772us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.03% 40.860us 2.03% 40.860us 6.810us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.670us 0.23% 4.670us 4.670us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 824.994us -Self CUDA time total: 40.510us +Self CPU time total: 2.012ms +Self CUDA time total: 40.478us @@ -4459,23 +4461,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.778us 450.33% 338.778us 338.778us 1 - hf_kernels_rotary 18.40% 
151.937us 99.39% 820.824us 820.824us 0.000us 0.00% 85.723us 85.723us 1 - aten::clone 2.47% 20.430us 65.45% 540.538us 90.090us 0.000us 0.00% 47.293us 7.882us 6 - aten::copy_ 4.41% 36.400us 59.08% 487.928us 81.321us 36.798us 48.92% 47.293us 7.882us 6 - _rotary_dba7d1e::apply_rotary 4.89% 40.390us 10.51% 86.760us 14.460us 38.430us 51.08% 38.430us 6.405us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 38.430us 51.08% 38.430us 6.405us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 36.798us 48.92% 36.798us 6.133us 6 - Activity Buffer Request 27.74% 229.134us 27.74% 229.134us 229.134us 10.495us 13.95% 10.495us 10.495us 1 - aten::empty_strided 3.90% 32.180us 3.90% 32.180us 5.363us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.93% 222.394us 26.93% 222.394us 37.066us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.90% 32.180us 5.04% 41.589us 3.466us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.14% 9.409us 1.14% 9.409us 0.784us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.61% 46.370us 5.61% 46.370us 7.728us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.61% 5.040us 0.61% 5.040us 5.040us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 361.785us 476.45% 361.785us 361.785us 1 + hf_kernels_rotary 8.64% 176.662us 99.77% 2.040ms 2.040ms 0.000us 0.00% 86.685us 86.685us 1 + aten::clone 1.40% 28.682us 84.64% 1.731ms 288.486us 0.000us 0.00% 47.871us 7.979us 6 + aten::copy_ 1.80% 36.737us 81.55% 1.668ms 277.962us 37.119us 48.88% 47.871us 7.979us 6 + _rotary_dba7d1e::apply_rotary 2.24% 45.910us 4.34% 88.820us 14.803us 38.814us 51.12% 38.814us 6.469us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 38.814us 51.12% 38.814us 6.469us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 37.119us 48.88% 37.119us 6.187us 6 + Activity Buffer Request 70.82% 1.448ms 70.82% 1.448ms 1.448ms 10.752us 14.16% 10.752us 10.752us 1 + aten::empty_strided 1.69% 34.462us 1.69% 34.462us 5.744us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.93% 182.677us 8.93% 182.677us 30.446us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.66% 33.994us 2.15% 43.925us 3.660us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.49% 9.931us 0.49% 9.931us 0.828us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.10% 42.910us 2.10% 42.910us 7.152us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.670us 0.23% 4.670us 4.670us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 825.864us -Self CUDA time total: 75.228us +Self CPU time total: 2.045ms +Self CUDA time total: 75.933us @@ -4485,23 +4487,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.815us 244.98% 338.815us 338.815us 1 - hf_kernels_rotary 17.96% 152.299us 99.45% 843.474us 843.474us 0.000us 0.00% 161.823us 161.823us 1 - aten::clone 2.40% 20.339us 66.32% 562.460us 93.743us 0.000us 0.00% 102.176us 17.029us 6 - aten::copy_ 
4.27% 36.251us 60.21% 510.629us 85.105us 78.656us 56.87% 102.176us 17.029us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.656us 56.87% 78.656us 13.109us 6 - _rotary_dba7d1e::apply_rotary 4.86% 41.202us 10.23% 86.763us 14.460us 59.647us 43.13% 59.647us 9.941us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 59.647us 43.13% 59.647us 9.941us 6 - Activity Buffer Request 30.37% 257.584us 30.37% 257.584us 257.584us 23.520us 17.01% 23.520us 23.520us 1 - aten::empty_strided 3.71% 31.492us 3.71% 31.492us 5.249us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.56% 216.794us 25.56% 216.794us 36.132us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.89% 32.951us 4.95% 41.952us 3.496us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.06% 9.001us 1.06% 9.001us 0.750us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.37% 45.561us 5.37% 45.561us 7.594us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.55% 4.640us 0.55% 4.640us 4.640us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 373.629us 268.97% 373.629us 373.629us 1 + hf_kernels_rotary 8.95% 179.578us 99.78% 2.002ms 2.002ms 0.000us 0.00% 162.750us 162.750us 1 + aten::clone 1.48% 29.597us 83.94% 1.684ms 280.680us 0.000us 0.00% 102.944us 17.157us 6 + aten::copy_ 1.82% 36.553us 80.73% 1.620ms 269.962us 79.104us 56.95% 102.944us 17.157us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 79.104us 56.95% 79.104us 13.184us 6 + _rotary_dba7d1e::apply_rotary 2.30% 46.131us 4.57% 91.713us 15.285us 59.806us 43.05% 59.806us 9.968us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 59.806us 43.05% 59.806us 9.968us 6 + Activity Buffer Request 69.91% 1.403ms 69.91% 1.403ms 1.403ms 23.840us 17.16% 23.840us 23.840us 1 + aten::empty_strided 1.73% 34.712us 1.73% 34.712us 5.785us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.00% 180.563us 9.00% 180.563us 30.094us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.75% 35.198us 2.31% 46.409us 3.867us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.56% 11.211us 0.56% 11.211us 0.934us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.27% 45.582us 2.27% 45.582us 7.597us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 4.510us 0.22% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 848.114us -Self CUDA time total: 138.303us +Self CPU time total: 2.006ms +Self CUDA time total: 138.910us @@ -4511,23 +4513,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 12.84% 152.812us 71.89% 855.575us 855.575us 0.000us 0.00% 769.625us 769.625us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 710.234us 101.16% 710.234us 710.234us 1 - aten::clone 1.76% 21.001us 48.07% 572.021us 95.337us 0.000us 0.00% 572.987us 95.498us 6 - aten::copy_ 3.15% 37.471us 43.65% 519.450us 86.575us 505.436us 71.99% 572.987us 95.498us 6 - Memcpy DtoD 
(Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 505.436us 71.99% 505.436us 84.239us 6 - _rotary_dba7d1e::apply_rotary 3.42% 40.722us 7.33% 87.262us 14.544us 196.638us 28.01% 196.638us 32.773us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 196.638us 28.01% 196.638us 32.773us 6 - Activity Buffer Request 21.90% 260.665us 21.90% 260.665us 260.665us 67.551us 9.62% 67.551us 67.551us 1 - aten::empty_strided 2.65% 31.570us 2.65% 31.570us 5.262us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.60% 221.314us 18.60% 221.314us 36.886us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.82% 33.601us 3.65% 43.480us 3.623us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.83% 9.879us 0.83% 9.879us 0.823us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.91% 46.540us 3.91% 46.540us 7.757us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 28.11% 334.485us 28.11% 334.485us 334.485us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 7.56% 177.196us 86.68% 2.032ms 2.032ms 0.000us 0.00% 778.402us 778.402us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 717.248us 101.07% 717.248us 717.248us 1 + aten::clone 1.23% 28.772us 72.98% 1.711ms 285.141us 0.000us 0.00% 578.626us 96.438us 6 + aten::copy_ 1.64% 38.341us 70.23% 1.646ms 274.415us 509.889us 71.85% 578.626us 96.438us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 509.889us 71.85% 509.889us 84.982us 6 + _rotary_dba7d1e::apply_rotary 2.34% 54.801us 4.25% 99.591us 16.598us 199.776us 28.15% 199.776us 33.296us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 199.776us 28.15% 199.776us 33.296us 6 + Activity Buffer Request 60.86% 1.427ms 60.86% 1.427ms 1.427ms 68.737us 9.69% 68.737us 68.737us 1 + aten::empty_strided 1.52% 35.581us 1.52% 35.581us 5.930us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.74% 181.435us 7.74% 181.435us 30.239us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.41% 33.151us 1.89% 44.330us 3.694us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.48% 11.179us 0.48% 11.179us 0.932us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.91% 44.790us 1.91% 44.790us 7.465us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 13.32% 312.348us 13.32% 312.348us 312.348us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.190ms -Self CUDA time total: 702.074us +Self CPU time total: 2.344ms +Self CUDA time total: 709.665us @@ -4537,23 +4539,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 340.957us 1280.69% 340.957us 340.957us 1 - hf_kernels_rotary 17.85% 154.192us 99.45% 858.915us 858.915us 0.000us 0.00% 27.935us 27.935us 1 - _rotary_dba7d1e::apply_rotary 4.82% 41.593us 10.09% 87.173us 14.529us 18.719us 70.31% 18.719us 3.120us 6 -void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 18.719us 70.31% 18.719us 3.120us 6 - aten::clone 2.51% 21.701us 66.67% 575.779us 95.963us 0.000us 0.00% 9.216us 1.536us 6 - aten::copy_ 4.05% 34.978us 60.54% 522.828us 87.138us 7.904us 29.69% 9.216us 1.536us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 29.69% 7.904us 1.317us 6 - Activity Buffer Request 30.68% 265.004us 30.68% 265.004us 265.004us 1.312us 4.93% 1.312us 1.312us 1 - aten::empty_strided 3.62% 31.250us 3.62% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 25.80% 222.846us 25.80% 222.846us 37.141us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.77% 32.522us 4.84% 41.771us 3.481us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.07% 9.249us 1.07% 9.249us 0.771us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.28% 45.580us 5.28% 45.580us 7.597us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.55% 4.760us 0.55% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.629us 1313.11% 349.629us 349.629us 1 + hf_kernels_rotary 8.75% 174.875us 99.76% 1.994ms 1.994ms 0.000us 0.00% 27.938us 27.938us 1 + _rotary_dba7d1e::apply_rotary 2.16% 43.200us 4.40% 87.900us 14.650us 18.754us 70.43% 18.754us 3.126us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 18.754us 70.43% 18.754us 3.126us 6 + aten::clone 1.44% 28.720us 84.48% 1.688ms 281.365us 0.000us 0.00% 9.184us 1.531us 6 + aten::copy_ 1.82% 36.432us 81.36% 1.626ms 271.003us 7.872us 29.57% 9.184us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 29.57% 7.872us 1.312us 6 + Activity Buffer Request 70.53% 1.410ms 70.53% 1.410ms 1.410ms 1.312us 4.93% 1.312us 1.312us 1 + aten::empty_strided 1.67% 33.452us 1.67% 33.452us 5.575us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.01% 180.083us 9.01% 180.083us 30.014us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.63% 32.560us 2.14% 42.684us 3.557us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.51% 10.124us 0.51% 10.124us 0.844us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.24% 44.700us 2.24% 44.700us 7.450us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.24% 4.780us 0.24% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 863.675us -Self CUDA time total: 26.623us +Self CPU time total: 1.998ms +Self CUDA time total: 26.626us @@ -4563,23 +4565,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 331.838us 1247.93% 331.838us 331.838us 1 - hf_kernels_rotary 18.40% 
149.763us 99.33% 808.424us 808.424us 0.000us 0.00% 27.871us 27.871us 1 - _rotary_dba7d1e::apply_rotary 5.12% 41.640us 10.68% 86.941us 14.490us 18.879us 71.00% 18.879us 3.147us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.879us 71.00% 18.879us 3.147us 6 - aten::clone 2.56% 20.830us 65.24% 531.000us 88.500us 0.000us 0.00% 8.992us 1.499us 6 - aten::copy_ 4.49% 36.550us 58.98% 480.009us 80.001us 7.712us 29.00% 8.992us 1.499us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 29.00% 7.712us 1.285us 6 - Activity Buffer Request 28.18% 229.375us 28.18% 229.375us 229.375us 1.280us 4.81% 1.280us 1.280us 1 - aten::empty_strided 3.71% 30.161us 3.71% 30.161us 5.027us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.30% 214.084us 26.30% 214.084us 35.681us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.92% 31.890us 5.00% 40.720us 3.393us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.08% 8.830us 1.08% 8.830us 0.736us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.57% 45.301us 5.57% 45.301us 7.550us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.67% 5.440us 0.67% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.698us 1282.22% 344.698us 344.698us 1 + hf_kernels_rotary 22.61% 152.757us 99.23% 670.538us 670.538us 0.000us 0.00% 28.195us 28.195us 1 + _rotary_dba7d1e::apply_rotary 6.64% 44.870us 12.97% 87.630us 14.605us 19.009us 70.71% 19.009us 3.168us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.009us 70.71% 19.009us 3.168us 6 + aten::clone 3.38% 22.839us 57.25% 386.869us 64.478us 0.000us 0.00% 9.186us 1.531us 6 + aten::copy_ 5.63% 38.041us 49.11% 331.829us 55.305us 7.874us 29.29% 9.186us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.874us 29.29% 7.874us 1.312us 6 + Activity Buffer Request 16.48% 111.363us 16.48% 111.363us 111.363us 1.312us 4.88% 1.312us 1.312us 1 + aten::empty_strided 4.77% 32.201us 4.77% 32.201us 5.367us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 27.00% 182.425us 27.00% 182.425us 30.404us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.90% 33.085us 6.41% 43.282us 3.607us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.51% 10.197us 1.51% 10.197us 0.850us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 6.33% 42.760us 6.33% 42.760us 7.127us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.77% 5.200us 0.77% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 813.864us -Self CUDA time total: 26.591us +Self CPU time total: 675.738us +Self CUDA time total: 26.883us @@ -4589,23 +4591,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.852us 1157.89% 353.852us 353.852us 1 - hf_kernels_rotary 
7.66% 156.034us 99.77% 2.033ms 2.033ms 0.000us 0.00% 32.320us 32.320us 1 - _rotary_dba7d1e::apply_rotary 2.04% 41.512us 4.26% 86.762us 14.460us 20.159us 65.97% 20.159us 3.360us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.159us 65.97% 20.159us 3.360us 6 - aten::clone 1.10% 22.431us 85.66% 1.746ms 290.955us 0.000us 0.00% 12.161us 2.027us 6 - aten::copy_ 2.23% 45.431us 82.85% 1.688ms 281.408us 10.401us 34.03% 12.161us 2.027us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.401us 34.03% 10.401us 1.734us 6 - Activity Buffer Request 70.07% 1.428ms 70.07% 1.428ms 1.428ms 1.760us 5.76% 1.760us 1.760us 1 - aten::empty_strided 1.71% 34.849us 1.71% 34.849us 5.808us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.54% 214.913us 10.54% 214.913us 35.819us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.68% 34.241us 2.20% 44.770us 3.731us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.52% 10.529us 0.52% 10.529us 0.877us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.22% 45.250us 2.22% 45.250us 7.542us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.770us 0.23% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.004us 1141.75% 350.004us 350.004us 1 + hf_kernels_rotary 19.05% 154.214us 99.36% 804.261us 804.261us 0.000us 0.00% 32.414us 32.414us 1 + _rotary_dba7d1e::apply_rotary 5.47% 44.240us 10.98% 88.910us 14.818us 20.064us 65.45% 20.064us 3.344us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 20.064us 65.45% 20.064us 3.344us 6 + aten::clone 3.02% 24.421us 63.80% 516.433us 86.072us 0.000us 0.00% 12.350us 2.058us 6 + aten::copy_ 4.66% 37.732us 56.69% 458.901us 76.483us 10.591us 34.55% 12.350us 2.058us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.591us 34.55% 10.591us 1.765us 6 + Activity Buffer Request 29.69% 240.306us 29.69% 240.306us 240.306us 1.759us 5.74% 1.759us 1.759us 1 + aten::empty_strided 4.09% 33.111us 4.09% 33.111us 5.518us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.34% 180.863us 22.34% 180.863us 30.144us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.15% 33.594us 5.52% 44.704us 3.725us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.37% 11.110us 1.37% 11.110us 0.926us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.52% 44.670us 5.52% 44.670us 7.445us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.201us 0.64% 5.201us 5.201us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.038ms -Self CUDA time total: 30.560us +Self CPU time total: 809.462us +Self CUDA time total: 30.655us @@ -4615,23 +4617,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 367.612us 860.51% 367.612us 367.612us 1 - hf_kernels_rotary 
7.69% 158.003us 99.76% 2.050ms 2.050ms 0.000us 0.00% 45.568us 45.568us 1 - _rotary_dba7d1e::apply_rotary 2.04% 41.961us 4.25% 87.391us 14.565us 25.759us 60.30% 25.759us 4.293us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.759us 60.30% 25.759us 4.293us 6 - aten::clone 1.11% 22.799us 84.82% 1.743ms 290.528us 0.000us 0.00% 19.809us 3.301us 6 - aten::copy_ 1.88% 38.712us 82.12% 1.688ms 281.267us 16.961us 39.70% 19.809us 3.301us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.961us 39.70% 16.961us 2.827us 6 - Activity Buffer Request 69.69% 1.432ms 69.69% 1.432ms 1.432ms 2.848us 6.67% 2.848us 2.848us 1 - aten::empty_strided 1.59% 32.771us 1.59% 32.771us 5.462us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.54% 216.613us 10.54% 216.613us 36.102us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.51% 51.572us 3.00% 61.672us 5.139us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.49% 10.100us 0.49% 10.100us 0.842us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.21% 45.430us 2.21% 45.430us 7.572us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.849us 0.24% 4.849us 4.849us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.355us 822.64% 350.355us 350.355us 1 + hf_kernels_rotary 19.55% 155.605us 99.35% 790.981us 790.981us 0.000us 0.00% 45.469us 45.469us 1 + _rotary_dba7d1e::apply_rotary 5.55% 44.191us 11.02% 87.731us 14.622us 25.565us 60.03% 25.565us 4.261us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 25.565us 60.03% 25.565us 4.261us 6 + aten::clone 2.81% 22.389us 63.13% 502.593us 83.766us 0.000us 0.00% 19.904us 3.317us 6 + aten::copy_ 4.90% 39.043us 56.13% 446.833us 74.472us 17.024us 39.97% 19.904us 3.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 39.97% 17.024us 2.837us 6 + Activity Buffer Request 28.37% 225.886us 28.37% 225.886us 225.886us 2.880us 6.76% 2.880us 2.880us 1 + aten::empty_strided 4.19% 33.371us 4.19% 33.371us 5.562us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.85% 181.904us 22.85% 181.904us 30.317us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.29% 34.142us 5.66% 45.052us 3.754us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.37% 10.910us 1.37% 10.910us 0.909us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.47% 43.540us 5.47% 43.540us 7.257us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.65% 5.140us 0.65% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.055ms -Self CUDA time total: 42.720us +Self CPU time total: 796.121us +Self CUDA time total: 42.589us @@ -4641,23 +4643,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.614us 1135.14% 347.614us 347.614us 1 - hf_kernels_rotary 
7.64% 156.781us 99.77% 2.046ms 2.046ms 0.000us 0.00% 32.383us 32.383us 1 - _rotary_dba7d1e::apply_rotary 2.01% 41.122us 4.16% 85.392us 14.232us 20.223us 66.04% 20.223us 3.370us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.223us 66.04% 20.223us 3.370us 6 - aten::clone 1.11% 22.841us 85.79% 1.759ms 293.223us 0.000us 0.00% 12.160us 2.027us 6 - aten::copy_ 1.81% 37.030us 83.06% 1.703ms 283.910us 10.400us 33.96% 12.160us 2.027us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 33.96% 10.400us 1.733us 6 - Activity Buffer Request 70.68% 1.449ms 70.68% 1.449ms 1.449ms 1.760us 5.75% 1.760us 1.760us 1 - aten::empty_strided 1.61% 33.040us 1.61% 33.040us 5.507us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.58% 216.984us 10.58% 216.984us 36.164us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.70% 34.784us 2.17% 44.532us 3.711us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.748us 0.48% 9.748us 0.812us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.16% 44.270us 2.16% 44.270us 7.378us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.760us 0.23% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.951us 1133.59% 344.951us 344.951us 1 + hf_kernels_rotary 19.05% 153.418us 99.42% 800.680us 800.680us 0.000us 0.00% 32.125us 32.125us 1 + _rotary_dba7d1e::apply_rotary 5.43% 43.718us 10.83% 87.180us 14.530us 20.095us 66.04% 20.095us 3.349us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 20.095us 66.04% 20.095us 3.349us 6 + aten::clone 2.75% 22.180us 64.20% 517.012us 86.169us 0.000us 0.00% 12.030us 2.005us 6 + aten::copy_ 4.82% 38.813us 57.22% 460.802us 76.800us 10.335us 33.96% 12.030us 2.005us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.335us 33.96% 10.335us 1.722us 6 + Activity Buffer Request 30.13% 242.666us 30.13% 242.666us 242.666us 1.695us 5.57% 1.695us 1.695us 1 + aten::empty_strided 4.23% 34.030us 4.23% 34.030us 5.672us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.27% 179.323us 22.27% 179.323us 29.887us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.11% 33.131us 5.35% 43.070us 3.589us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.23% 9.939us 1.23% 9.939us 0.828us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.40% 43.462us 5.40% 43.462us 7.244us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.58% 4.660us 0.58% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.051ms -Self CUDA time total: 30.623us +Self CPU time total: 805.340us +Self CUDA time total: 30.430us @@ -4667,23 +4669,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 328.444us 771.23% 328.444us 328.444us 1 - hf_kernels_rotary 
18.84% 150.934us 99.38% 796.084us 796.084us 0.000us 0.00% 45.403us 45.403us 1 - _rotary_dba7d1e::apply_rotary 5.06% 40.529us 10.59% 84.811us 14.135us 25.693us 60.33% 25.693us 4.282us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.693us 60.33% 25.693us 4.282us 6 - aten::clone 2.49% 19.929us 64.90% 519.868us 86.645us 0.000us 0.00% 19.710us 3.285us 6 - aten::copy_ 4.41% 35.321us 58.57% 469.148us 78.191us 16.894us 39.67% 19.710us 3.285us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.894us 39.67% 16.894us 2.816us 6 - Activity Buffer Request 27.59% 221.013us 27.59% 221.013us 221.013us 2.816us 6.61% 2.816us 2.816us 1 - aten::empty_strided 3.84% 30.791us 3.84% 30.791us 5.132us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.57% 212.814us 26.57% 212.814us 35.469us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.92% 31.361us 5.05% 40.471us 3.373us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.14% 9.110us 1.14% 9.110us 0.759us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.53% 44.282us 5.53% 44.282us 7.380us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 4.951us 0.62% 4.951us 4.951us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 358.905us 840.15% 358.905us 358.905us 1 + hf_kernels_rotary 15.26% 159.123us 99.55% 1.038ms 1.038ms 0.000us 0.00% 45.598us 45.598us 1 + _rotary_dba7d1e::apply_rotary 4.27% 44.490us 8.42% 87.790us 14.632us 25.600us 59.93% 25.600us 4.267us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 25.600us 59.93% 25.600us 4.267us 6 + aten::clone 2.23% 23.211us 71.54% 746.059us 124.343us 0.000us 0.00% 19.998us 3.333us 6 + aten::copy_ 3.70% 38.572us 65.96% 687.817us 114.636us 17.119us 40.07% 19.998us 3.333us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.119us 40.07% 17.119us 2.853us 6 + Activity Buffer Request 44.90% 468.242us 44.90% 468.242us 468.242us 2.879us 6.74% 2.879us 2.879us 1 + aten::empty_strided 3.36% 35.031us 3.36% 35.031us 5.838us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.36% 181.003us 17.36% 181.003us 30.167us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.32% 34.604us 4.33% 45.135us 3.761us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.01% 10.531us 1.01% 10.531us 0.878us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.15% 43.300us 4.15% 43.300us 7.217us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.45% 4.700us 0.45% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 801.035us -Self CUDA time total: 42.587us +Self CPU time total: 1.043ms +Self CUDA time total: 42.719us @@ -4693,23 +4695,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.910us 380.70% 338.910us 338.910us 1 - hf_kernels_rotary 
14.14% 150.935us 99.54% 1.062ms 1.062ms 0.000us 0.00% 104.734us 104.734us 1 - aten::clone 2.00% 21.371us 73.53% 784.703us 130.784us 0.000us 0.00% 63.775us 10.629us 6 - aten::copy_ 3.58% 38.219us 68.59% 731.952us 121.992us 48.063us 53.99% 63.775us 10.629us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 48.063us 53.99% 48.063us 8.010us 6 - _rotary_dba7d1e::apply_rotary 3.85% 41.059us 8.05% 85.950us 14.325us 40.959us 46.01% 40.959us 6.826us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 40.959us 46.01% 40.959us 6.826us 6 - Activity Buffer Request 44.86% 478.699us 44.86% 478.699us 478.699us 15.712us 17.65% 15.712us 15.712us 1 - aten::empty_strided 2.94% 31.380us 2.94% 31.380us 5.230us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 20.15% 215.034us 20.15% 215.034us 35.839us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.96% 31.591us 3.81% 40.690us 3.391us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.85% 9.099us 0.85% 9.099us 0.758us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.21% 44.891us 4.21% 44.891us 7.482us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.46% 4.900us 0.46% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 383.638us 432.19% 383.638us 383.638us 1 + hf_kernels_rotary 19.20% 158.364us 99.38% 819.611us 819.611us 0.000us 0.00% 103.870us 103.870us 1 + aten::clone 2.74% 22.581us 61.51% 507.313us 84.552us 0.000us 0.00% 63.135us 10.522us 6 + aten::copy_ 4.83% 39.811us 54.76% 451.622us 75.270us 48.031us 54.11% 63.135us 10.522us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 48.031us 54.11% 48.031us 8.005us 6 + _rotary_dba7d1e::apply_rotary 5.49% 45.243us 13.16% 108.504us 18.084us 40.735us 45.89% 40.735us 6.789us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 40.735us 45.89% 40.735us 6.789us 6 + Activity Buffer Request 27.50% 226.825us 27.50% 226.825us 226.825us 15.104us 17.02% 15.104us 15.104us 1 + aten::empty_strided 4.01% 33.110us 4.01% 33.110us 5.518us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.43% 184.986us 22.43% 184.986us 30.831us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.25% 35.021us 5.51% 45.430us 3.786us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.26% 10.409us 1.26% 10.409us 0.867us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 7.67% 63.261us 7.67% 63.261us 10.543us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.141us 0.62% 5.141us 5.141us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.067ms -Self CUDA time total: 89.022us +Self CPU time total: 824.752us +Self CUDA time total: 88.766us @@ -4719,23 +4721,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 336.057us 230.21% 336.057us 336.057us 1 - hf_kernels_rotary 18.72% 149.775us 99.40% 795.224us 795.224us 0.000us 0.00% 169.949us 169.949us 1 - aten::clone 2.52% 20.180us 65.04% 520.348us 86.725us 0.000us 0.00% 106.527us 17.755us 6 - aten::copy_ 4.49% 35.890us 58.61% 468.868us 78.145us 82.559us 56.55% 106.527us 17.755us 6 - Memcpy DtoD 
(Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 82.559us 56.55% 82.559us 13.760us 6 - _rotary_dba7d1e::apply_rotary 5.12% 40.981us 10.49% 83.942us 13.990us 63.422us 43.45% 63.422us 10.570us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.422us 43.45% 63.422us 10.570us 6 - Activity Buffer Request 27.82% 222.544us 27.82% 222.544us 222.544us 23.968us 16.42% 23.968us 23.968us 1 - aten::empty_strided 3.91% 31.300us 3.91% 31.300us 5.217us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.30% 210.434us 26.30% 210.434us 35.072us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.94% 31.518us 5.14% 41.159us 3.430us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.21% 9.641us 1.21% 9.641us 0.803us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.37% 42.961us 5.37% 42.961us 7.160us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.60% 4.790us 0.60% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 359.259us 247.18% 359.259us 359.259us 1 + hf_kernels_rotary 19.06% 158.337us 99.39% 825.781us 825.781us 0.000us 0.00% 168.829us 168.829us 1 + aten::clone 2.83% 23.549us 64.09% 532.493us 88.749us 0.000us 0.00% 105.470us 17.578us 6 + aten::copy_ 4.58% 38.013us 57.29% 475.972us 79.329us 81.982us 56.41% 105.470us 17.578us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.982us 56.41% 81.982us 13.664us 6 + _rotary_dba7d1e::apply_rotary 5.47% 45.451us 10.86% 90.251us 15.042us 63.359us 43.59% 63.359us 10.560us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 63.359us 43.59% 63.359us 10.560us 6 + Activity Buffer Request 31.29% 259.966us 31.29% 259.966us 259.966us 23.488us 16.16% 23.488us 23.488us 1 + aten::empty_strided 3.97% 32.972us 3.97% 32.972us 5.495us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.42% 177.993us 21.42% 177.993us 29.665us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.19% 34.839us 5.38% 44.700us 3.725us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.19% 9.861us 1.19% 9.861us 0.822us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.39% 44.800us 5.39% 44.800us 7.467us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.61% 5.100us 0.61% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 800.014us -Self CUDA time total: 145.981us +Self CPU time total: 830.881us +Self CUDA time total: 145.341us @@ -4745,23 +4747,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.836us 451.90% 339.836us 339.836us 1 - hf_kernels_rotary 18.57% 150.269us 99.38% 804.154us 804.154us 0.000us 0.00% 81.986us 81.986us 1 - _rotary_dba7d1e::apply_rotary 4.99% 40.401us 10.49% 84.862us 14.144us 41.601us 55.32% 41.601us 6.933us 6 -void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 41.601us 55.32% 41.601us 6.933us 6 - aten::clone 2.54% 20.532us 64.81% 524.439us 87.406us 0.000us 0.00% 40.385us 6.731us 6 - aten::copy_ 4.41% 35.708us 58.24% 471.217us 78.536us 33.601us 44.68% 40.385us 6.731us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 33.601us 44.68% 33.601us 5.600us 6 - Activity Buffer Request 27.71% 224.174us 27.71% 224.174us 224.174us 6.784us 9.02% 6.784us 6.784us 1 - aten::empty_strided 4.04% 32.690us 4.04% 32.690us 5.448us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.12% 211.335us 26.12% 211.335us 35.223us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.32% 34.924us 5.51% 44.584us 3.715us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.19% 9.660us 1.19% 9.660us 0.805us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.49% 44.461us 5.49% 44.461us 7.410us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 4.981us 0.62% 4.981us 4.981us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 385.725us 509.05% 385.725us 385.725us 1 + hf_kernels_rotary 8.62% 176.456us 99.78% 2.043ms 2.043ms 0.000us 0.00% 82.558us 82.558us 1 + _rotary_dba7d1e::apply_rotary 2.32% 47.603us 4.41% 90.273us 15.045us 41.694us 55.02% 41.694us 6.949us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 41.694us 55.02% 41.694us 6.949us 6 + aten::clone 1.42% 29.000us 84.54% 1.731ms 288.534us 0.000us 0.00% 40.864us 6.811us 6 + aten::copy_ 1.93% 39.552us 80.14% 1.641ms 273.497us 34.080us 44.98% 40.864us 6.811us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 34.080us 44.98% 34.080us 5.680us 6 + Activity Buffer Request 69.16% 1.416ms 69.16% 1.416ms 1.416ms 6.784us 8.95% 6.784us 6.784us 1 + aten::empty_strided 2.99% 61.221us 2.99% 61.221us 10.204us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.05% 185.224us 9.05% 185.224us 30.871us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.69% 34.591us 2.21% 45.260us 3.772us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.52% 10.669us 0.52% 10.669us 0.889us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.08% 42.670us 2.08% 42.670us 7.112us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.22% 4.530us 0.22% 4.530us 4.530us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 809.135us -Self CUDA time total: 75.202us +Self CPU time total: 2.048ms +Self CUDA time total: 75.774us @@ -4771,23 +4773,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 372.859us 256.14% 372.859us 372.859us 1 - hf_kernels_rotary 18.64% 
161.451us 99.43% 861.125us 861.125us 0.000us 0.00% 169.279us 169.279us 1 - aten::clone 2.47% 21.401us 63.58% 550.631us 91.772us 0.000us 0.00% 105.373us 17.562us 6 - aten::copy_ 4.30% 37.239us 57.31% 496.359us 82.727us 81.662us 56.10% 105.373us 17.562us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.662us 56.10% 81.662us 13.610us 6 - _rotary_dba7d1e::apply_rotary 5.12% 44.341us 12.24% 106.023us 17.671us 63.906us 43.90% 63.906us 10.651us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 63.906us 43.90% 63.906us 10.651us 6 - Activity Buffer Request 28.62% 247.854us 28.62% 247.854us 247.854us 23.711us 16.29% 23.711us 23.711us 1 - aten::empty_strided 3.80% 32.871us 3.80% 32.871us 5.479us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 24.39% 211.266us 24.39% 211.266us 35.211us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.88% 33.609us 4.97% 43.020us 3.585us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.09% 9.411us 1.09% 9.411us 0.784us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 7.12% 61.682us 7.12% 61.682us 10.280us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.969us 0.57% 4.969us 4.969us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 368.925us 253.94% 368.925us 368.925us 1 + hf_kernels_rotary 8.62% 177.641us 99.74% 2.055ms 2.055ms 0.000us 0.00% 169.118us 169.118us 1 + aten::clone 1.42% 29.322us 84.62% 1.743ms 290.539us 0.000us 0.00% 105.470us 17.578us 6 + aten::copy_ 1.92% 39.462us 81.52% 1.679ms 279.897us 81.631us 56.19% 105.470us 17.578us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 81.631us 56.19% 81.631us 13.605us 6 + _rotary_dba7d1e::apply_rotary 2.27% 46.683us 4.40% 90.665us 15.111us 63.648us 43.81% 63.648us 10.608us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 63.648us 43.81% 63.648us 10.608us 6 + Activity Buffer Request 70.79% 1.458ms 70.79% 1.458ms 1.458ms 23.839us 16.41% 23.839us 23.839us 1 + aten::empty_strided 1.68% 34.530us 1.68% 34.530us 5.755us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.81% 181.504us 8.81% 181.504us 30.251us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.62% 33.289us 2.09% 43.080us 3.590us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.48% 9.791us 0.48% 9.791us 0.816us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.13% 43.982us 2.13% 43.982us 7.330us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.450us 0.26% 5.450us 5.450us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 866.094us -Self CUDA time total: 145.568us +Self CPU time total: 2.060ms +Self CUDA time total: 145.279us @@ -4797,23 +4799,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 13.02% 148.583us 72.32% 825.404us 825.404us 0.000us 0.00% 745.510us 745.510us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 687.015us 101.19% 687.015us 687.015us 1 - aten::clone 1.76% 20.130us 47.96% 547.368us 91.228us 0.000us 0.00% 556.292us 92.715us 6 - aten::copy_ 3.18% 36.280us 43.26% 493.818us 82.303us 489.699us 72.13% 556.292us 92.715us 6 - Memcpy DtoD (Device 
-> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 489.699us 72.13% 489.699us 81.617us 6 - _rotary_dba7d1e::apply_rotary 3.57% 40.732us 7.58% 86.552us 14.425us 189.218us 27.87% 189.218us 31.536us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 189.218us 27.87% 189.218us 31.536us 6 - Activity Buffer Request 21.89% 249.905us 21.89% 249.905us 249.905us 66.593us 9.81% 66.593us 66.593us 1 - aten::empty_strided 2.93% 33.420us 2.93% 33.420us 5.570us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.19% 207.633us 18.19% 207.633us 34.606us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.92% 33.351us 3.76% 42.901us 3.575us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.84% 9.550us 0.84% 9.550us 0.796us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.01% 45.820us 4.01% 45.820us 7.637us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 27.68% 315.986us 27.68% 315.986us 315.986us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 20.72% 223.838us 78.32% 845.992us 845.992us 0.000us 0.00% 747.476us 747.476us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 688.117us 101.15% 688.117us 688.117us 1 + aten::clone 2.05% 22.091us 45.23% 488.522us 81.420us 0.000us 0.00% 558.423us 93.070us 6 + aten::copy_ 3.67% 39.650us 40.20% 434.190us 72.365us 491.256us 72.21% 558.423us 93.070us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 491.256us 72.21% 491.256us 81.876us 6 + _rotary_dba7d1e::apply_rotary 4.18% 45.161us 8.45% 91.252us 15.209us 189.053us 27.79% 189.053us 31.509us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 189.053us 27.79% 189.053us 31.509us 6 + Activity Buffer Request 19.62% 211.896us 19.62% 211.896us 211.896us 67.167us 9.87% 67.167us 67.167us 1 + aten::empty_strided 2.98% 32.241us 2.98% 32.241us 5.374us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.91% 182.644us 16.91% 182.644us 30.441us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.05% 32.939us 3.92% 42.380us 3.532us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.87% 9.441us 0.87% 9.441us 0.787us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.27% 46.091us 4.27% 46.091us 7.682us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 21.68% 234.186us 21.68% 234.186us 234.186us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.141ms -Self CUDA time total: 678.917us +Self CPU time total: 1.080ms +Self CUDA time total: 680.309us @@ -4823,23 +4825,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 5.26% 153.062us 28.60% 832.074us 832.074us 0.000us 0.00% 2.627ms 2.627ms 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.451ms 100.32% 2.451ms 2.451ms 1 - aten::clone 0.71% 20.751us 18.92% 550.432us 91.739us 0.000us 0.00% 1.403ms 233.752us 6 - aten::copy_ 1.33% 38.628us 17.10% 497.389us 82.898us 1.219ms 49.87% 1.403ms 233.752us 6 - 
_rotary_dba7d1e::apply_rotary 1.42% 41.449us 2.89% 84.050us 14.008us 1.225ms 50.13% 1.225ms 204.141us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 1.225ms 50.13% 1.225ms 204.141us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.219ms 49.87% 1.219ms 203.112us 6 - Activity Buffer Request 8.62% 250.725us 8.62% 250.725us 250.725us 183.838us 7.52% 183.838us 183.838us 1 - aten::empty_strided 1.11% 32.292us 1.11% 32.292us 5.382us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.15% 208.036us 7.15% 208.036us 34.673us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.18% 34.219us 1.53% 44.530us 3.711us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.35% 10.311us 0.35% 10.311us 0.859us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 1.46% 42.601us 1.46% 42.601us 7.100us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 71.40% 2.077ms 71.40% 2.077ms 2.077ms 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 5.41% 154.946us 27.83% 797.061us 797.061us 0.000us 0.00% 2.625ms 2.625ms 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 2.453ms 100.31% 2.453ms 2.453ms 1 + aten::clone 0.79% 22.601us 17.83% 510.683us 85.114us 0.000us 0.00% 1.396ms 232.586us 6 + aten::copy_ 1.43% 40.940us 15.89% 455.120us 75.853us 1.216ms 49.74% 1.396ms 232.586us 6 + _rotary_dba7d1e::apply_rotary 1.59% 45.590us 3.06% 87.640us 14.607us 1.229ms 50.26% 1.229ms 204.885us 6 +void at::native::(anonymous namespace)::unrolled_ele... 
0.00% 0.000us 0.00% 0.000us 0.000us 1.229ms 50.26% 1.229ms 204.885us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.216ms 49.74% 1.216ms 202.730us 6 + Activity Buffer Request 7.23% 207.076us 7.23% 207.076us 207.076us 179.136us 7.32% 179.136us 179.136us 1 + aten::empty_strided 1.15% 32.962us 1.15% 32.962us 5.494us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 7.23% 207.104us 7.23% 207.104us 34.517us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.15% 33.011us 1.53% 43.792us 3.649us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.38% 10.781us 0.38% 10.781us 0.898us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 1.47% 42.050us 1.47% 42.050us 7.008us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 72.17% 2.067ms 72.17% 2.067ms 2.067ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.909ms -Self CUDA time total: 2.444ms +Self CPU time total: 2.864ms +Self CUDA time total: 2.446ms impl wl p50(ms) ok @@ -4848,8 +4850,8 @@ hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 True hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True -hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True -hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.10 True +hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.10 True hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 True @@ -4860,7 +4862,7 @@ hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.85 True 
-hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.26 True +hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.27 True hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True @@ -4871,14 +4873,12 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.33it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.31it/s]
+Fetching 5 files: 60%|██████ | 3/5 [00:00<00:00, 28.46it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 9.80it/s]

Artifacts:

rotary.jsonl diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html index 27d1faf03e42ae5d5ac730cfd03392b62eb2b62f..d7b34676102680b464b702d7de0525c0d9d460d2 100644 --- a/rotary/impls/torch_rotary.html +++ b/rotary/impls/torch_rotary.html @@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.21s +Cell: nv | 0.23s | Raw -GitHub +GitHub
@@ -4122,7 +4122,7 @@ Cell: nv | 0.21s
-
Thu Oct 30 15:52:23 2025       
+
Fri Oct 31 20:00:00 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   32C    P0            101W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4153,13 +4153,13 @@ Cell: nv | 0.21s
 
 ▼ code 
 ▼ output
- ▶ uv-logs
+ ▶ uv-logs
  | 
-Cell: benchmark | 3.86s
+Cell: benchmark | 7.58s
  | 
 
 Raw
-GitHub
+GitHub
 
@@ -4234,27 +4234,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.038ms 1165.07% 1.038ms 1.038ms 1 - torch_eager 14.25% 384.344us 99.73% 2.691ms 2.691ms 0.000us 0.00% 90.272us 90.272us 1 - aten::mul 6.11% 164.889us 10.39% 280.433us 11.685us 46.752us 52.50% 46.752us 1.948us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.752us 52.50% 46.752us 1.948us 24 - aten::copy_ 4.15% 111.919us 62.66% 1.690ms 93.917us 29.025us 32.59% 30.240us 1.680us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.306us 25.05% 22.306us 1.859us 12 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 13.280us 14.91% 13.280us 1.107us 12 - aten::clone 1.43% 38.559us 61.06% 1.647ms 274.577us 0.000us 0.00% 7.934us 1.322us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.719us 7.54% 6.719us 1.120us 6 - aten::sub 1.59% 42.770us 2.55% 68.721us 11.454us 6.688us 7.51% 6.688us 1.115us 6 - aten::add 1.63% 44.070us 2.49% 67.170us 11.195us 6.592us 7.40% 6.592us 1.099us 6 - Activity Buffer Request 53.52% 1.444ms 53.52% 1.444ms 1.444ms 1.215us 1.36% 1.215us 1.215us 1 - aten::empty_strided 2.14% 57.723us 2.14% 57.723us 9.620us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.62% 70.572us 2.62% 70.572us 11.762us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.99% 80.691us 3.82% 103.161us 4.298us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.83% 22.470us 0.83% 22.470us 0.936us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.47% 228.526us 8.47% 228.526us 4.761us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.27% 7.361us 0.27% 7.361us 7.361us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.068ms 1195.27% 1.068ms 1.068ms 1 + torch_eager 14.00% 388.140us 99.71% 2.764ms 2.764ms 0.000us 0.00% 90.528us 90.528us 1 + aten::mul 6.16% 170.676us 10.43% 289.217us 12.051us 46.911us 52.52% 46.911us 1.955us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.911us 52.52% 46.911us 1.955us 24 + aten::copy_ 4.25% 117.935us 62.65% 1.737ms 96.500us 29.185us 32.68% 30.401us 1.689us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.561us 25.26% 22.561us 1.880us 12 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 13.216us 14.80% 13.216us 1.101us 12 + aten::clone 1.62% 44.961us 61.78% 1.713ms 285.451us 0.000us 0.00% 7.840us 1.307us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 7.42% 6.624us 1.104us 6 + aten::sub 1.59% 44.071us 2.54% 70.301us 11.717us 6.624us 7.42% 6.624us 1.104us 6 + aten::add 1.26% 34.801us 2.08% 57.721us 9.620us 6.592us 7.38% 6.592us 1.099us 6 + Activity Buffer Request 53.17% 1.474ms 53.17% 1.474ms 1.474ms 1.216us 1.36% 1.216us 1.216us 1 + aten::empty_strided 2.35% 65.251us 2.35% 65.251us 10.875us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.98% 82.752us 2.98% 82.752us 13.792us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.05% 84.591us 4.03% 111.694us 4.654us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.98% 27.103us 0.98% 27.103us 1.129us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.29% 229.882us 8.29% 229.882us 4.789us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.29% 8.120us 0.29% 8.120us 8.120us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.698ms -Self CUDA time total: 89.057us +Self CPU time total: 2.772ms +Self CUDA time total: 89.312us @@ -4264,27 +4264,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 960.319us 
1064.55% 960.319us 960.319us 1 - torch_eager 12.91% 327.841us 99.79% 2.533ms 2.533ms 0.000us 0.00% 91.361us 91.361us 1 - aten::mul 6.09% 154.573us 10.36% 263.046us 10.960us 47.616us 52.78% 47.616us 1.984us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.616us 52.78% 47.616us 1.984us 24 - aten::copy_ 4.38% 111.264us 65.83% 1.671ms 92.839us 29.313us 32.49% 30.465us 1.692us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.496us 24.94% 22.496us 1.875us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.280us 14.72% 13.280us 1.107us 12 - aten::clone 1.07% 27.110us 62.73% 1.592ms 265.408us 0.000us 0.00% 7.969us 1.328us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.817us 7.56% 6.817us 1.136us 6 - aten::sub 1.66% 42.072us 2.63% 66.652us 11.109us 6.688us 7.41% 6.688us 1.115us 6 - aten::add 1.28% 32.560us 2.18% 55.291us 9.215us 6.592us 7.31% 6.592us 1.099us 6 - Activity Buffer Request 56.87% 1.444ms 56.87% 1.444ms 1.444ms 1.152us 1.28% 1.152us 1.152us 1 - aten::empty_strided 1.25% 31.671us 1.25% 31.671us 5.278us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.12% 53.780us 2.12% 53.780us 8.963us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.76% 70.023us 3.57% 90.653us 3.777us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.81% 20.630us 0.81% 20.630us 0.860us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.59% 218.025us 8.59% 218.025us 4.542us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.289us 0.21% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 960.345us 1063.10% 960.345us 960.345us 1 + torch_eager 11.94% 304.272us 99.78% 2.543ms 2.543ms 0.000us 0.00% 91.454us 91.454us 1 + aten::mul 6.19% 157.625us 10.77% 274.398us 11.433us 47.776us 52.89% 47.776us 1.991us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 47.776us 52.89% 47.776us 1.991us 24 + aten::copy_ 4.14% 105.392us 66.58% 1.697ms 94.258us 29.343us 32.48% 30.463us 1.692us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.559us 24.97% 22.559us 1.880us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.215us 14.63% 13.215us 1.101us 12 + aten::clone 0.97% 24.733us 63.76% 1.625ms 270.825us 0.000us 0.00% 7.904us 1.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.51% 6.784us 1.131us 6 + aten::add 1.23% 31.452us 2.12% 54.072us 9.012us 6.623us 7.33% 6.623us 1.104us 6 + aten::sub 1.53% 39.032us 2.55% 64.964us 10.827us 6.592us 7.30% 6.592us 1.099us 6 + Activity Buffer Request 57.59% 1.468ms 57.59% 1.468ms 1.468ms 1.120us 1.24% 1.120us 1.120us 1 + aten::empty_strided 1.31% 33.410us 1.31% 33.410us 5.568us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.43% 61.963us 2.43% 61.963us 10.327us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.76% 70.222us 3.54% 90.271us 3.761us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.79% 20.049us 0.79% 20.049us 0.835us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.91% 226.937us 8.91% 226.937us 4.728us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.22% 5.590us 0.22% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.539ms -Self CUDA time total: 90.209us +Self CPU time total: 2.548ms +Self CUDA time total: 90.334us @@ -4294,27 +4294,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % 
Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 950.812us 1013.41% 950.812us 950.812us 1 - torch_eager 12.58% 319.124us 99.78% 2.531ms 2.531ms 0.000us 0.00% 95.135us 95.135us 1 - aten::mul 6.09% 154.550us 10.34% 262.291us 10.929us 48.671us 51.88% 48.671us 2.028us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.671us 51.88% 48.671us 2.028us 24 - aten::copy_ 4.10% 104.029us 66.32% 1.682ms 93.470us 30.783us 32.81% 32.095us 1.783us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.943us 24.45% 22.943us 1.912us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.369us 15.32% 14.369us 1.197us 12 - aten::clone 1.04% 26.300us 63.34% 1.607ms 267.803us 0.000us 0.00% 9.152us 1.525us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 8.36% 7.840us 1.307us 6 - aten::sub 1.64% 41.492us 2.64% 66.953us 11.159us 7.199us 7.67% 7.199us 1.200us 6 - aten::add 1.26% 31.999us 2.14% 54.310us 9.052us 7.170us 7.64% 7.170us 1.195us 6 - Activity Buffer Request 57.64% 1.462ms 57.64% 1.462ms 1.462ms 1.312us 1.40% 1.312us 1.312us 1 - aten::empty_strided 1.26% 31.840us 1.26% 31.840us 5.307us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.05% 52.102us 2.05% 52.102us 8.684us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.68% 67.986us 3.47% 87.958us 3.665us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.79% 19.972us 0.79% 19.972us 0.832us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.65% 219.475us 8.65% 219.475us 4.572us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.22% 5.651us 0.22% 5.651us 5.651us 0.000us 
0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 989.616us 1051.23% 989.616us 989.616us 1 + torch_eager 12.09% 307.194us 99.76% 2.536ms 2.536ms 0.000us 0.00% 95.450us 95.450us 1 + aten::mul 6.35% 161.494us 11.09% 281.865us 11.744us 48.958us 52.01% 48.958us 2.040us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.958us 52.01% 48.958us 2.040us 24 + aten::copy_ 4.30% 109.293us 66.10% 1.680ms 93.343us 30.814us 32.73% 32.125us 1.785us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.943us 24.37% 22.943us 1.912us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.367us 15.26% 14.367us 1.197us 12 + aten::clone 0.97% 24.599us 62.75% 1.595ms 265.823us 0.000us 0.00% 9.182us 1.530us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.871us 8.36% 7.871us 1.312us 6 + aten::add 1.20% 30.579us 2.08% 52.891us 8.815us 7.199us 7.65% 7.199us 1.200us 6 + aten::sub 1.49% 37.871us 2.53% 64.231us 10.705us 7.168us 7.61% 7.168us 1.195us 6 + Activity Buffer Request 56.57% 1.438ms 56.57% 1.438ms 1.438ms 1.311us 1.39% 1.311us 1.311us 1 + aten::empty_strided 1.38% 35.041us 1.38% 35.041us 5.840us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.38% 60.441us 2.38% 60.441us 10.074us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.77% 70.298us 3.53% 89.841us 3.743us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.77% 19.543us 0.77% 19.543us 0.814us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.50% 241.544us 9.50% 241.544us 5.032us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.24% 6.100us 0.24% 6.100us 6.100us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.537ms -Self CUDA time total: 
93.823us +Self CPU time total: 2.542ms +Self CUDA time total: 94.139us @@ -4324,27 +4324,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 952.670us 942.15% 952.670us 952.670us 1 - torch_eager 11.55% 312.506us 99.79% 2.701ms 2.701ms 0.000us 0.00% 102.429us 102.429us 1 - aten::mul 5.68% 153.743us 9.71% 262.695us 10.946us 52.765us 52.18% 52.765us 2.199us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.765us 52.18% 52.765us 2.199us 24 - aten::copy_ 3.97% 107.471us 68.61% 1.857ms 103.165us 32.353us 32.00% 33.665us 1.870us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 24.37% 24.641us 2.053us 12 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 15.999us 15.82% 15.999us 1.333us 12 - aten::clone 1.01% 27.330us 65.76% 1.780ms 296.625us 0.000us 0.00% 9.024us 1.504us 6 - aten::add 1.21% 32.850us 2.05% 55.600us 9.267us 8.032us 7.94% 8.032us 1.339us 6 - aten::sub 1.44% 39.082us 2.35% 63.492us 10.582us 7.967us 7.88% 7.967us 1.328us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.63% 7.712us 1.285us 6 - Activity Buffer Request 52.99% 1.434ms 52.99% 1.434ms 1.434ms 1.312us 1.30% 1.312us 1.312us 1 - aten::empty_strided 1.20% 32.420us 1.20% 32.420us 5.403us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.27% 250.924us 9.27% 250.924us 41.821us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.56% 69.212us 3.32% 89.782us 3.741us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.76% 20.570us 0.76% 20.570us 0.857us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.14% 220.374us 8.14% 220.374us 4.591us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.791us 0.21% 5.791us 5.791us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 928.327us 916.02% 928.327us 928.327us 1 + torch_eager 12.51% 290.049us 99.77% 2.313ms 2.313ms 0.000us 0.00% 102.689us 102.689us 1 + aten::mul 6.36% 147.401us 11.12% 257.946us 10.748us 52.800us 52.10% 52.800us 2.200us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.800us 52.10% 52.800us 2.200us 24 + aten::copy_ 4.62% 107.204us 65.04% 1.508ms 83.777us 32.415us 31.99% 33.760us 1.876us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.511us 24.19% 24.511us 2.043us 12 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 16.129us 15.92% 16.129us 1.344us 12 + aten::clone 0.98% 22.822us 61.74% 1.431ms 238.579us 0.000us 0.00% 9.249us 1.542us 6 + aten::add 1.37% 31.668us 2.34% 54.320us 9.053us 8.096us 7.99% 8.096us 1.349us 6 + aten::sub 1.57% 36.291us 2.61% 60.431us 10.072us 8.033us 7.93% 8.033us 1.339us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 7.80% 7.904us 1.317us 6 + Activity Buffer Request 46.02% 1.067ms 46.02% 1.067ms 1.067ms 1.345us 1.33% 1.345us 1.345us 1 + aten::empty_strided 1.38% 31.940us 1.38% 31.940us 5.323us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.71% 271.508us 11.71% 271.508us 45.251us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.99% 69.429us 3.79% 87.781us 3.658us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.79% 18.352us 0.79% 18.352us 0.765us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.47% 219.548us 9.47% 219.548us 4.574us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.23% 5.380us 0.23% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.707ms -Self CUDA time total: 101.117us +Self CPU time total: 2.319ms +Self CUDA time total: 101.344us @@ -4354,27 +4354,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 
987.399us 1051.70% 987.399us 987.399us 1 - torch_eager 12.37% 335.778us 99.82% 2.710ms 2.710ms 0.000us 0.00% 95.198us 95.198us 1 - aten::mul 5.74% 155.881us 9.81% 266.305us 11.096us 48.927us 52.11% 48.927us 2.039us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.927us 52.11% 48.927us 2.039us 24 - aten::copy_ 3.95% 107.229us 67.43% 1.830ms 101.693us 30.753us 32.76% 32.065us 1.781us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.881us 24.37% 22.881us 1.907us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.206us 15.13% 14.206us 1.184us 12 - aten::clone 0.99% 26.953us 64.69% 1.756ms 292.683us 0.000us 0.00% 9.184us 1.531us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 8.38% 7.872us 1.312us 6 - aten::add 1.25% 33.910us 2.11% 57.361us 9.560us 7.103us 7.57% 7.103us 1.184us 6 - aten::sub 1.62% 44.010us 2.55% 69.231us 11.538us 7.103us 7.57% 7.103us 1.184us 6 - Activity Buffer Request 53.49% 1.452ms 53.49% 1.452ms 1.452ms 1.312us 1.40% 1.312us 1.312us 1 - aten::empty_strided 1.24% 33.730us 1.24% 33.730us 5.622us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.66% 207.874us 7.66% 207.874us 34.646us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.54% 68.958us 3.31% 89.820us 3.743us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.77% 20.862us 0.77% 20.862us 0.869us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.19% 222.327us 8.19% 222.327us 4.632us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.000us 0.18% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.018ms 1082.59% 1.018ms 1.018ms 1 + torch_eager 11.47% 329.955us 99.81% 2.870ms 2.870ms 0.000us 0.00% 95.358us 95.358us 1 + aten::mul 5.65% 162.614us 9.86% 283.677us 11.820us 49.056us 52.16% 49.056us 2.044us 24 +void at::native::elementwise_kernel<128, 4, 
at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.056us 52.16% 49.056us 2.044us 24 + aten::copy_ 3.88% 111.664us 68.17% 1.960ms 108.907us 30.720us 32.66% 32.032us 1.780us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 24.33% 22.880us 1.907us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.270us 15.17% 14.270us 1.189us 12 + aten::clone 1.07% 30.831us 65.73% 1.890ms 315.021us 0.000us 0.00% 9.152us 1.525us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 8.34% 7.840us 1.307us 6 + aten::add 1.15% 33.191us 2.07% 59.441us 9.907us 7.167us 7.62% 7.167us 1.194us 6 + aten::sub 1.59% 45.863us 2.59% 74.463us 12.411us 7.103us 7.55% 7.103us 1.184us 6 + Activity Buffer Request 50.07% 1.440ms 50.07% 1.440ms 1.440ms 1.312us 1.40% 1.312us 1.312us 1 + aten::empty_strided 1.26% 36.310us 1.26% 36.310us 6.052us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.96% 343.839us 11.96% 343.839us 57.306us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.64% 75.860us 3.31% 95.264us 3.969us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.67% 19.404us 0.67% 19.404us 0.809us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.38% 240.995us 8.38% 240.995us 5.021us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.330us 0.19% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.715ms -Self CUDA time total: 93.886us +Self CPU time total: 2.876ms +Self CUDA time total: 94.046us @@ -4384,27 +4384,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ 
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 939.771us 930.81% 939.771us 939.771us 1 - torch_eager 11.42% 294.218us 99.78% 2.570ms 2.570ms 0.000us 0.00% 102.276us 102.276us 1 - aten::mul 5.85% 150.653us 10.08% 259.594us 10.816us 52.609us 52.11% 52.609us 2.192us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.609us 52.11% 52.609us 2.192us 24 - aten::copy_ 4.01% 103.273us 68.02% 1.752ms 97.337us 32.450us 32.14% 33.763us 1.876us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 24.40% 24.640us 2.053us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.904us 15.75% 15.904us 1.325us 12 - aten::clone 0.87% 22.360us 64.99% 1.674ms 278.983us 0.000us 0.00% 9.123us 1.520us 6 - aten::sub 1.58% 40.669us 2.53% 65.240us 10.873us 7.968us 7.89% 7.968us 1.328us 6 - aten::add 1.32% 33.930us 2.20% 56.580us 9.430us 7.936us 7.86% 7.936us 1.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.810us 7.74% 7.810us 1.302us 6 - Activity Buffer Request 54.28% 1.398ms 54.28% 1.398ms 1.398ms 1.313us 1.30% 1.313us 1.313us 1 - aten::empty_strided 1.21% 31.291us 1.21% 31.291us 5.215us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.34% 188.943us 7.34% 188.943us 31.491us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.69% 69.330us 3.44% 88.671us 3.695us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.75% 19.341us 0.75% 19.341us 0.806us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.46% 218.003us 8.46% 218.003us 4.542us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.22% 5.651us 0.22% 
5.651us 5.651us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 913.335us 900.40% 913.335us 913.335us 1 + torch_eager 10.58% 290.726us 99.81% 2.742ms 2.742ms 0.000us 0.00% 102.781us 102.781us 1 + aten::mul 5.30% 145.663us 9.31% 255.637us 10.652us 52.735us 51.99% 52.735us 2.197us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.735us 51.99% 52.735us 2.197us 24 + aten::copy_ 3.74% 102.751us 70.53% 1.937ms 107.622us 32.638us 32.18% 33.982us 1.888us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.735us 24.38% 24.735us 2.061us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.064us 15.84% 16.064us 1.339us 12 + aten::clone 0.88% 24.121us 67.96% 1.867ms 311.110us 0.000us 0.00% 9.247us 1.541us 6 + aten::sub 1.29% 35.411us 2.16% 59.202us 9.867us 8.033us 7.92% 8.033us 1.339us 6 + aten::add 1.13% 30.931us 1.93% 52.952us 8.825us 8.031us 7.92% 8.031us 1.339us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.903us 7.79% 7.903us 1.317us 6 + Activity Buffer Request 52.85% 1.452ms 52.85% 1.452ms 1.452ms 1.344us 1.32% 1.344us 1.344us 1 + aten::empty_strided 1.21% 33.351us 1.21% 33.351us 5.559us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.71% 321.577us 11.71% 321.577us 53.596us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.55% 69.990us 3.22% 88.522us 3.688us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.67% 18.532us 0.67% 18.532us 0.772us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.90% 216.969us 7.90% 216.969us 4.520us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.091us 0.19% 5.091us 5.091us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 
2.576ms -Self CUDA time total: 100.963us +Self CPU time total: 2.747ms +Self CUDA time total: 101.437us @@ -4414,27 +4414,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 987.019us 820.52% 987.019us 987.019us 1 - torch_eager 11.12% 293.915us 99.79% 2.637ms 2.637ms 0.000us 0.00% 122.116us 122.116us 1 - aten::mul 6.22% 164.251us 10.48% 276.937us 11.539us 61.922us 51.48% 61.922us 2.580us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.922us 51.48% 61.922us 2.580us 24 - aten::copy_ 3.96% 104.584us 67.08% 1.772ms 98.461us 39.265us 32.64% 41.089us 2.283us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.833us 23.97% 28.833us 2.403us 12 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.105us 15.88% 19.105us 1.592us 12 - aten::clone 0.81% 21.321us 64.15% 1.695ms 282.483us 0.000us 0.00% 12.256us 2.043us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.67% 10.432us 1.739us 6 - aten::sub 1.58% 41.691us 2.56% 67.622us 11.270us 9.569us 7.95% 9.569us 1.595us 6 - aten::add 1.31% 34.540us 2.17% 57.381us 9.563us 9.536us 7.93% 9.536us 1.589us 6 - Activity Buffer Request 53.87% 1.423ms 53.87% 1.423ms 1.423ms 1.824us 1.52% 1.824us 1.824us 1 - aten::empty_strided 1.17% 30.940us 1.17% 30.940us 5.157us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.97% 184.193us 6.97% 184.193us 30.699us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.59% 94.920us 4.40% 116.150us 4.840us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.80% 21.230us 0.80% 21.230us 0.885us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.38% 221.517us 8.38% 221.517us 4.615us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.631us 0.21% 5.631us 5.631us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 929.433us 768.61% 929.433us 929.433us 1 + torch_eager 10.84% 297.701us 99.80% 2.742ms 2.742ms 0.000us 0.00% 122.716us 122.716us 1 + aten::mul 5.42% 148.850us 9.41% 258.632us 10.776us 62.014us 51.28% 62.014us 2.584us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.014us 51.28% 62.014us 2.584us 24 + aten::copy_ 3.77% 103.682us 70.14% 1.927ms 107.043us 39.328us 32.52% 41.120us 2.284us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.82% 28.800us 2.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.582us 16.19% 19.582us 1.632us 12 + aten::clone 0.88% 24.131us 67.45% 1.853ms 308.828us 0.000us 0.00% 12.320us 2.053us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 8.71% 10.528us 1.755us 6 + aten::sub 1.29% 35.482us 2.16% 59.433us 9.905us 9.792us 8.10% 9.792us 1.632us 6 + aten::add 1.13% 31.104us 1.94% 53.172us 8.862us 9.790us 8.10% 9.790us 1.632us 6 + Activity Buffer Request 52.94% 1.454ms 52.94% 1.454ms 1.454ms 1.792us 1.48% 1.792us 1.792us 1 + aten::empty_strided 1.18% 32.542us 1.18% 32.542us 5.424us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.19% 307.407us 11.19% 307.407us 51.235us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.56% 70.268us 3.25% 89.361us 3.723us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.70% 19.093us 0.70% 19.093us 0.796us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.91% 217.262us 7.91% 217.262us 4.526us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.370us 0.20% 5.370us 5.370us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.642ms -Self CUDA time total: 120.292us +Self CPU time total: 2.747ms +Self CUDA time total: 120.924us @@ -4444,27 +4444,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 
942.977us 547.62% 942.977us 942.977us 1 - torch_eager 11.98% 313.186us 99.77% 2.608ms 2.608ms 0.000us 0.00% 175.043us 175.043us 1 - aten::mul 5.92% 154.664us 10.07% 263.135us 10.964us 89.731us 52.11% 89.731us 3.739us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.731us 52.11% 89.731us 3.739us 24 - aten::copy_ 4.21% 110.022us 67.75% 1.771ms 98.397us 57.632us 33.47% 60.480us 3.360us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.640us 23.60% 40.640us 3.387us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.832us 14.42% 24.832us 2.069us 12 - aten::clone 1.00% 26.050us 64.65% 1.690ms 281.685us 0.000us 0.00% 19.840us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.87% 16.992us 2.832us 6 - aten::add 1.22% 32.012us 2.08% 54.302us 9.050us 12.416us 7.21% 12.416us 2.069us 6 - aten::sub 1.48% 38.721us 2.41% 62.881us 10.480us 12.416us 7.21% 12.416us 2.069us 6 - Activity Buffer Request 54.20% 1.417ms 54.20% 1.417ms 1.417ms 2.848us 1.65% 2.848us 2.848us 1 - aten::empty_strided 1.15% 30.180us 1.15% 30.180us 5.030us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.98% 182.574us 6.98% 182.574us 30.429us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.56% 66.979us 3.34% 87.351us 3.640us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.78% 20.372us 0.78% 20.372us 0.849us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.28% 216.491us 8.28% 216.491us 4.510us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.23% 5.900us 0.23% 5.900us 5.900us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 942.082us 549.37% 942.082us 942.082us 1 + torch_eager 20.10% 308.752us 99.67% 1.531ms 1.531ms 0.000us 0.00% 174.365us 174.365us 1 + aten::mul 9.79% 150.414us 16.96% 260.516us 10.855us 89.056us 51.93% 89.056us 3.711us 24 +void 
at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.056us 51.93% 89.056us 3.711us 24 + aten::copy_ 6.91% 106.224us 46.22% 710.060us 39.448us 57.503us 33.53% 60.383us 3.355us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.511us 23.62% 40.511us 3.376us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.926us 14.54% 24.926us 2.077us 12 + aten::clone 1.37% 21.029us 40.87% 627.796us 104.633us 0.000us 0.00% 19.872us 3.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.91% 16.992us 2.832us 6 + aten::sub 2.26% 34.730us 3.83% 58.781us 9.797us 12.479us 7.28% 12.479us 2.080us 6 + aten::add 2.00% 30.683us 3.45% 52.973us 8.829us 12.447us 7.26% 12.447us 2.075us 6 + Activity Buffer Request 16.15% 248.056us 16.15% 248.056us 248.056us 2.880us 1.68% 2.880us 2.880us 1 + aten::empty_strided 2.04% 31.392us 2.04% 31.392us 5.232us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.97% 291.479us 18.97% 291.479us 48.580us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.49% 68.986us 5.70% 87.586us 3.649us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.21% 18.600us 1.21% 18.600us 0.775us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.37% 220.744us 14.37% 220.744us 4.599us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.33% 5.080us 0.33% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.614ms -Self CUDA time total: 172.195us +Self CPU time total: 1.536ms +Self CUDA time total: 171.485us @@ -4474,27 +4474,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 954.334us 791.88% 954.334us 954.334us 1 - torch_eager 21.12% 286.823us 99.60% 1.352ms 1.352ms 0.000us 0.00% 122.339us 122.339us 1 - aten::mul 11.39% 154.733us 19.43% 263.854us 10.994us 61.889us 51.35% 61.889us 2.579us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.889us 51.35% 61.889us 2.579us 24 - aten::copy_ 8.06% 109.392us 38.94% 528.759us 29.376us 39.393us 32.69% 41.217us 2.290us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.864us 23.95% 28.864us 2.405us 12 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.233us 15.96% 19.233us 1.603us 12 - aten::clone 1.54% 20.901us 32.67% 443.638us 73.940us 0.000us 0.00% 12.353us 2.059us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.529us 8.74% 10.529us 1.755us 6 - aten::sub 2.93% 39.731us 4.81% 65.293us 10.882us 9.633us 7.99% 9.633us 1.606us 6 - aten::add 2.54% 34.552us 4.77% 64.792us 10.799us 9.600us 7.97% 9.600us 1.600us 6 - Activity Buffer Request 12.72% 172.763us 12.72% 172.763us 172.763us 1.824us 1.51% 1.824us 1.824us 1 - aten::empty_strided 2.32% 31.561us 2.32% 31.561us 5.260us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.45% 182.623us 13.45% 182.623us 30.437us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.17% 70.140us 6.66% 90.481us 3.770us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.50% 20.341us 1.50% 20.341us 0.848us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.86% 228.904us 16.86% 228.904us 4.769us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.490us 0.40% 5.490us 5.490us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 906.096us 748.31% 906.096us 906.096us 1 + torch_eager 18.91% 280.775us 99.66% 1.480ms 1.480ms 0.000us 0.00% 122.910us 122.910us 1 + aten::mul 10.01% 148.664us 17.45% 259.167us 10.799us 62.174us 51.35% 62.174us 2.591us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.174us 51.35% 62.174us 2.591us 24 + aten::copy_ 6.88% 102.100us 46.50% 690.526us 38.363us 39.392us 32.53% 41.216us 2.290us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.78% 28.800us 2.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.520us 16.12% 19.520us 1.627us 12 + aten::clone 1.45% 21.579us 41.36% 614.176us 102.363us 0.000us 0.00% 12.416us 2.069us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.592us 8.75% 10.592us 1.765us 6 + aten::sub 2.32% 34.432us 3.90% 57.973us 9.662us 9.760us 8.06% 9.760us 1.627us 6 + aten::add 2.12% 31.432us 3.61% 53.552us 8.925us 9.760us 8.06% 9.760us 1.627us 6 + Activity Buffer Request 17.05% 253.136us 17.05% 253.136us 253.136us 1.824us 1.51% 1.824us 1.824us 1 + aten::empty_strided 2.06% 30.533us 2.06% 30.533us 5.089us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.50% 274.717us 18.50% 274.717us 45.786us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.53% 67.311us 5.78% 85.812us 3.575us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.25% 18.501us 1.25% 18.501us 0.771us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.60% 216.737us 14.60% 216.737us 4.515us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.34% 4.981us 0.34% 4.981us 4.981us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.358ms -Self CUDA time total: 120.515us +Self CPU time total: 1.485ms +Self CUDA time total: 121.086us @@ -4504,27 +4504,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 
0.000us 961.439us 559.06% 961.439us 961.439us 1 - torch_eager 21.39% 301.083us 99.65% 1.403ms 1.403ms 0.000us 0.00% 174.821us 174.821us 1 - aten::mul 10.92% 153.723us 18.79% 264.437us 11.018us 89.541us 52.07% 89.541us 3.731us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.541us 52.07% 89.541us 3.731us 24 - aten::copy_ 8.57% 120.662us 41.11% 578.630us 32.146us 57.631us 33.51% 60.479us 3.360us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.639us 23.63% 40.639us 3.387us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.801us 14.42% 24.801us 2.067us 12 - aten::clone 1.49% 21.022us 33.99% 478.490us 79.748us 0.000us 0.00% 19.840us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.88% 16.992us 2.832us 6 - aten::add 2.26% 31.841us 3.85% 54.131us 9.022us 12.481us 7.26% 12.481us 2.080us 6 - aten::sub 2.79% 39.260us 4.52% 63.691us 10.615us 12.320us 7.16% 12.320us 2.053us 6 - Activity Buffer Request 15.02% 211.404us 15.02% 211.404us 211.404us 2.848us 1.66% 2.848us 2.848us 1 - aten::empty_strided 2.10% 29.500us 2.10% 29.500us 4.917us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.01% 183.184us 13.01% 183.184us 30.531us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.96% 69.812us 6.41% 90.211us 3.759us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.45% 20.399us 1.45% 20.399us 0.850us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.69% 220.815us 15.69% 220.815us 4.600us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.35% 4.890us 0.35% 4.890us 4.890us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 954.294us 555.32% 954.294us 954.294us 1 + torch_eager 11.21% 307.269us 99.82% 2.735ms 2.735ms 0.000us 0.00% 174.694us 174.694us 1 + aten::mul 5.59% 153.258us 9.69% 265.580us 11.066us 89.476us 52.07% 89.476us 3.728us 24 +void 
at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.476us 52.07% 89.476us 3.728us 24 + aten::copy_ 3.78% 103.631us 69.46% 1.903ms 105.735us 57.505us 33.46% 60.353us 3.353us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.545us 23.59% 40.545us 3.379us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.865us 14.47% 24.865us 2.072us 12 + aten::clone 0.89% 24.491us 66.72% 1.828ms 304.733us 0.000us 0.00% 19.808us 3.301us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.87% 16.960us 2.827us 6 + aten::add 1.15% 31.480us 1.96% 53.761us 8.960us 12.448us 7.24% 12.448us 2.075us 6 + aten::sub 1.31% 35.801us 2.17% 59.462us 9.910us 12.417us 7.23% 12.417us 2.070us 6 + Activity Buffer Request 53.91% 1.477ms 53.91% 1.477ms 1.477ms 2.848us 1.66% 2.848us 2.848us 1 + aten::empty_strided 1.13% 30.930us 1.13% 30.930us 5.155us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.51% 260.666us 9.51% 260.666us 43.444us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.58% 70.761us 3.30% 90.449us 3.769us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.72% 19.688us 0.72% 19.688us 0.820us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.03% 220.086us 8.03% 220.086us 4.585us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.030us 0.18% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.408ms -Self CUDA time total: 171.973us +Self CPU time total: 2.740ms +Self CUDA time total: 171.846us @@ -4534,27 +4534,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 959.740us 338.81% 959.740us 959.740us 1 - torch_eager 11.78% 309.495us 99.81% 2.622ms 2.622ms 0.000us 0.00% 301.248us 301.248us 1 - aten::mul 5.80% 152.430us 9.98% 262.294us 10.929us 133.378us 47.09% 133.378us 5.557us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.378us 47.09% 133.378us 5.557us 24 - aten::copy_ 4.09% 107.511us 67.37% 1.770ms 98.338us 108.832us 38.42% 126.816us 7.045us 18 - aten::clone 1.07% 28.041us 64.54% 1.696ms 282.603us 0.000us 0.00% 69.600us 11.600us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.216us 20.20% 57.216us 4.768us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.616us 18.22% 51.616us 8.603us 6 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 41.054us 14.49% 41.054us 3.421us 12 - aten::sub 1.57% 41.190us 2.52% 66.080us 11.013us 20.607us 7.27% 20.607us 3.434us 6 - aten::add 1.56% 40.972us 2.46% 64.512us 10.752us 20.447us 7.22% 20.447us 3.408us 6 - Activity Buffer Request 53.79% 1.413ms 53.79% 1.413ms 1.413ms 17.984us 6.35% 17.984us 17.984us 1 - aten::empty_strided 1.19% 31.311us 1.19% 31.311us 5.218us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.14% 187.713us 7.14% 187.713us 31.285us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.66% 69.760us 3.44% 90.282us 3.762us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.78% 20.522us 0.78% 20.522us 0.855us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.37% 219.936us 8.37% 219.936us 4.582us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.111us 0.19% 5.111us 5.111us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 917.943us 324.46% 917.943us 917.943us 1 + torch_eager 18.90% 277.703us 99.65% 1.464ms 1.464ms 0.000us 0.00% 301.376us 301.376us 1 + aten::mul 9.84% 144.586us 17.44% 256.139us 10.672us 132.736us 46.92% 132.736us 5.531us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.736us 46.92% 132.736us 5.531us 24 + aten::copy_ 7.06% 103.765us 45.63% 670.307us 37.239us 109.119us 38.57% 127.583us 7.088us 18 + aten::clone 1.58% 23.262us 40.78% 599.096us 99.849us 0.000us 0.00% 70.336us 11.723us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.247us 20.23% 57.247us 4.771us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.872us 18.34% 51.872us 8.645us 6 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 41.057us 14.51% 41.057us 3.421us 12 + aten::add 2.13% 31.271us 3.65% 53.632us 8.939us 20.545us 7.26% 20.545us 3.424us 6 + aten::sub 2.39% 35.109us 4.06% 59.711us 9.952us 20.512us 7.25% 20.512us 3.419us 6 + Activity Buffer Request 16.07% 236.106us 16.07% 236.106us 236.106us 18.464us 6.53% 18.464us 18.464us 1 + aten::empty_strided 2.35% 34.500us 2.35% 34.500us 5.750us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.36% 269.767us 18.36% 269.767us 44.961us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.78% 70.183us 6.04% 88.753us 3.698us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.26% 18.570us 1.26% 18.570us 0.774us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.92% 219.185us 14.92% 219.185us 4.566us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.090us 0.35% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.627ms -Self CUDA time total: 283.264us +Self CPU time total: 1.469ms +Self CUDA time total: 282.912us @@ -4564,27 +4564,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 964.299us 170.17% 964.299us 964.299us 1 - torch_eager 21.37% 289.253us 99.58% 1.348ms 1.348ms 0.000us 0.00% 590.419us 590.419us 1 - aten::copy_ 7.69% 104.123us 37.93% 513.450us 
28.525us 274.106us 48.37% 297.849us 16.547us 18 - aten::mul 11.75% 159.118us 20.07% 271.705us 11.321us 226.427us 39.96% 226.427us 9.434us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 226.427us 39.96% 226.427us 9.434us 24 - aten::clone 1.55% 21.020us 32.53% 440.358us 73.393us 0.000us 0.00% 206.843us 34.474us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 183.100us 32.31% 183.100us 30.517us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.006us 16.06% 91.006us 7.584us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.143us 11.67% 66.143us 5.512us 12 - aten::sub 3.06% 41.432us 4.99% 67.562us 11.260us 33.664us 5.94% 33.664us 5.611us 6 - aten::add 2.43% 32.930us 4.17% 56.451us 9.408us 32.479us 5.73% 32.479us 5.413us 6 - Activity Buffer Request 11.95% 161.793us 11.95% 161.793us 161.793us 23.743us 4.19% 23.743us 23.743us 1 - aten::empty_strided 2.85% 38.611us 2.85% 38.611us 6.435us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.59% 183.934us 13.59% 183.934us 30.656us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.13% 69.460us 6.64% 89.941us 3.748us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.51% 20.481us 1.51% 20.481us 0.853us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.68% 225.838us 16.68% 225.838us 4.705us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.42% 5.710us 0.42% 5.710us 5.710us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.832us 165.35% 931.832us 931.832us 1 + torch_eager 19.27% 283.137us 99.64% 1.464ms 1.464ms 0.000us 0.00% 587.261us 587.261us 1 + aten::copy_ 7.04% 103.435us 44.90% 659.587us 36.644us 272.511us 48.36% 296.223us 16.457us 18 + aten::mul 10.36% 152.225us 18.18% 267.110us 11.130us 224.829us 39.90% 224.829us 9.368us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 224.829us 39.90% 224.829us 9.368us 24 + aten::clone 1.47% 21.550us 39.53% 580.673us 96.779us 0.000us 0.00% 205.855us 34.309us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.143us 32.32% 182.143us 30.357us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.368us 16.04% 90.368us 7.531us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.209us 11.75% 66.209us 5.517us 12 + aten::sub 2.39% 35.041us 4.07% 59.831us 9.972us 33.760us 5.99% 33.760us 5.627us 6 + aten::add 2.15% 31.591us 3.70% 54.401us 9.067us 32.449us 5.76% 32.449us 5.408us 6 + Activity Buffer Request 16.23% 238.406us 16.23% 238.406us 238.406us 23.712us 4.21% 23.712us 23.712us 1 + aten::empty_strided 2.04% 29.960us 2.04% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.39% 255.475us 17.39% 255.475us 42.579us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.73% 69.441us 6.00% 88.092us 3.670us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.27% 18.651us 1.27% 18.651us 0.777us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.30% 224.756us 15.30% 224.756us 4.682us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.36% 5.280us 0.36% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.354ms -Self CUDA time total: 566.676us +Self CPU time total: 1.469ms +Self CUDA time total: 563.549us @@ -4594,27 +4594,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self 
CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 940.757us 1018.68% 940.757us 940.757us 1 - torch_eager 20.92% 284.932us 99.61% 1.357ms 1.357ms 0.000us 0.00% 93.503us 93.503us 1 - aten::mul 11.51% 156.743us 19.57% 266.566us 11.107us 49.664us 53.78% 49.664us 2.069us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.664us 53.78% 49.664us 2.069us 24 - aten::copy_ 7.76% 105.742us 39.84% 542.619us 30.146us 29.343us 31.77% 30.495us 1.694us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.528us 24.39% 22.528us 1.877us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.344us 14.45% 13.344us 1.112us 12 - aten::clone 1.52% 20.734us 33.85% 461.099us 76.850us 0.000us 0.00% 7.967us 1.328us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.815us 7.38% 6.815us 1.136us 6 - aten::sub 2.96% 40.252us 4.79% 65.263us 10.877us 6.688us 7.24% 6.688us 1.115us 6 - aten::add 2.34% 31.811us 3.99% 54.311us 9.052us 6.656us 7.21% 6.656us 1.109us 6 - Activity Buffer Request 14.09% 191.853us 14.09% 191.853us 191.853us 1.152us 1.25% 1.152us 1.152us 1 - aten::empty_strided 2.30% 31.379us 2.30% 31.379us 5.230us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.46% 183.403us 13.46% 183.403us 30.567us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.20% 70.859us 6.67% 90.910us 3.788us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.47% 20.051us 1.47% 20.051us 0.835us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.08% 218.955us 16.08% 218.955us 4.562us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.39% 5.360us 0.39% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 
0.000us 0.00% 0.000us 0.000us 948.157us 1025.28% 948.157us 948.157us 1 + torch_eager 11.31% 303.890us 99.80% 2.681ms 2.681ms 0.000us 0.00% 93.597us 93.597us 1 + aten::mul 5.70% 153.152us 9.94% 267.009us 11.125us 49.696us 53.74% 49.696us 2.071us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.696us 53.74% 49.696us 2.071us 24 + aten::copy_ 3.75% 100.883us 69.10% 1.857ms 103.143us 29.375us 31.76% 30.494us 1.694us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.592us 24.43% 22.592us 1.883us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.407us 14.50% 13.407us 1.117us 12 + aten::clone 0.85% 22.792us 66.32% 1.782ms 296.986us 0.000us 0.00% 7.902us 1.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 7.33% 6.783us 1.131us 6 + aten::sub 1.31% 35.191us 2.17% 58.341us 9.724us 6.720us 7.27% 6.720us 1.120us 6 + aten::add 1.15% 30.820us 1.98% 53.181us 8.863us 6.687us 7.23% 6.687us 1.114us 6 + Activity Buffer Request 53.95% 1.449ms 53.95% 1.449ms 1.449ms 1.119us 1.21% 1.119us 1.119us 1 + aten::empty_strided 1.15% 30.830us 1.15% 30.830us 5.138us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.13% 245.326us 9.13% 245.326us 40.888us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.61% 70.171us 3.31% 88.830us 3.701us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.69% 18.659us 0.69% 18.659us 0.777us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.20% 220.298us 8.20% 220.298us 4.590us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.250us 0.20% 5.250us 5.250us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.362ms -Self CUDA time total: 92.351us +Self CPU time total: 2.687ms +Self 
CUDA time total: 92.478us @@ -4624,27 +4624,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 945.706us 986.10% 945.706us 945.706us 1 - torch_eager 12.18% 322.968us 99.79% 2.647ms 2.647ms 0.000us 0.00% 97.216us 97.216us 1 - aten::mul 5.85% 155.091us 9.99% 264.924us 11.039us 50.947us 53.12% 50.947us 2.123us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 50.947us 53.12% 50.947us 2.123us 24 - aten::copy_ 3.92% 103.931us 67.30% 1.785ms 99.174us 30.783us 32.10% 32.095us 1.783us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.976us 23.96% 22.976us 1.915us 12 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 14.174us 14.78% 14.174us 1.181us 12 - aten::clone 1.18% 31.280us 64.70% 1.716ms 286.035us 0.000us 0.00% 9.119us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 8.14% 7.807us 1.301us 6 - aten::add 1.22% 32.380us 2.09% 55.311us 9.219us 7.102us 7.41% 7.102us 1.184us 6 - aten::sub 1.50% 39.882us 2.41% 63.892us 10.649us 7.072us 7.37% 7.072us 1.179us 6 - Activity Buffer Request 53.95% 1.431ms 53.95% 1.431ms 1.431ms 1.312us 1.37% 1.312us 1.312us 1 - aten::empty_strided 1.23% 32.600us 1.23% 32.600us 5.433us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.05% 187.002us 7.05% 187.002us 31.167us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.63% 69.642us 3.43% 90.901us 3.788us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.80% 21.259us 0.80% 21.259us 0.886us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.29% 220.006us 8.29% 220.006us 4.583us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.569us 0.21% 5.569us 5.569us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 924.823us 959.84% 924.823us 924.823us 1 + torch_eager 19.47% 279.525us 99.65% 1.430ms 1.430ms 0.000us 0.00% 97.664us 97.664us 1 + aten::mul 10.27% 147.364us 19.04% 273.370us 11.390us 51.165us 53.10% 51.165us 2.132us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.165us 53.10% 51.165us 2.132us 24 + aten::copy_ 7.14% 102.519us 43.74% 627.869us 34.882us 30.913us 32.08% 32.225us 1.790us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.040us 23.91% 23.040us 1.920us 12 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 14.274us 14.81% 14.274us 1.189us 12 + aten::clone 1.45% 20.838us 38.33% 550.144us 91.691us 0.000us 0.00% 9.185us 1.531us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.873us 8.17% 7.873us 1.312us 6 + aten::add 2.18% 31.279us 3.75% 53.900us 8.983us 7.137us 7.41% 7.137us 1.189us 6 + aten::sub 2.45% 35.101us 4.11% 58.931us 9.822us 7.137us 7.41% 7.137us 1.189us 6 + Activity Buffer Request 15.34% 220.215us 15.34% 220.215us 220.215us 1.312us 1.36% 1.312us 1.312us 1 + aten::empty_strided 2.15% 30.891us 2.15% 30.891us 5.148us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.11% 245.545us 17.11% 245.545us 40.924us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.62% 66.322us 5.93% 85.082us 3.545us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.31% 18.760us 1.31% 18.760us 0.782us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.17% 232.047us 16.17% 232.047us 4.834us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.041us 0.35% 5.041us 5.041us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.653ms -Self CUDA time total: 95.904us +Self CPU time total: 1.435ms +Self CUDA time total: 96.352us @@ -4654,27 +4654,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 
963.956us 929.78% 963.956us 963.956us 1 - torch_eager 11.95% 315.942us 99.78% 2.637ms 2.637ms 0.000us 0.00% 104.988us 104.988us 1 - aten::mul 6.01% 158.721us 10.21% 269.951us 11.248us 55.295us 53.33% 55.295us 2.304us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.295us 53.33% 55.295us 2.304us 24 - aten::copy_ 4.03% 106.403us 67.45% 1.783ms 99.031us 32.417us 31.27% 33.729us 1.874us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.607us 23.73% 24.607us 2.051us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.964us 15.40% 15.964us 1.330us 12 - aten::clone 1.02% 26.870us 64.62% 1.708ms 284.615us 0.000us 0.00% 9.122us 1.520us 6 - aten::add 1.23% 32.629us 2.10% 55.390us 9.232us 7.997us 7.71% 7.997us 1.333us 6 - aten::sub 1.44% 38.041us 2.36% 62.260us 10.377us 7.967us 7.68% 7.967us 1.328us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.810us 7.53% 7.810us 1.302us 6 - Activity Buffer Request 54.08% 1.429ms 54.08% 1.429ms 1.429ms 1.312us 1.27% 1.312us 1.312us 1 - aten::empty_strided 1.27% 33.640us 1.27% 33.640us 5.607us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.95% 183.544us 6.95% 183.544us 30.591us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.64% 69.789us 3.42% 90.471us 3.770us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.78% 20.682us 0.78% 20.682us 0.862us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.39% 221.610us 8.39% 221.610us 4.617us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.22% 5.700us 0.22% 5.700us 5.700us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 915.886us 880.13% 915.886us 915.886us 1 + torch_eager 19.45% 278.057us 99.65% 1.425ms 1.425ms 0.000us 0.00% 105.374us 105.374us 1 + aten::mul 10.44% 149.250us 18.09% 258.645us 10.777us 55.325us 53.17% 55.325us 2.305us 24 +void at::native::elementwise_kernel<128, 
4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.325us 53.17% 55.325us 2.305us 24 + aten::copy_ 7.22% 103.283us 44.53% 636.707us 35.373us 32.575us 31.30% 33.887us 1.883us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 23.74% 24.703us 2.059us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.162us 15.53% 16.162us 1.347us 12 + aten::clone 1.49% 21.291us 38.97% 557.204us 92.867us 0.000us 0.00% 9.184us 1.531us 6 + aten::sub 2.42% 34.610us 4.09% 58.491us 9.749us 8.096us 7.78% 8.096us 1.349us 6 + aten::add 2.18% 31.210us 3.76% 53.710us 8.952us 8.066us 7.75% 8.066us 1.344us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 7.56% 7.872us 1.312us 6 + Activity Buffer Request 15.88% 227.005us 15.88% 227.005us 227.005us 1.312us 1.26% 1.312us 1.312us 1 + aten::empty_strided 2.12% 30.341us 2.12% 30.341us 5.057us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.11% 244.667us 17.11% 244.667us 40.778us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.81% 68.755us 6.12% 87.484us 3.645us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.31% 18.729us 1.31% 18.729us 0.780us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.21% 217.528us 15.21% 217.528us 4.532us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.011us 0.35% 5.011us 5.011us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.643ms -Self CUDA time total: 103.676us +Self CPU time total: 1.430ms +Self CUDA time total: 104.062us @@ -4684,27 +4684,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ 
------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 933.942us 757.68% 933.942us 933.942us 1 - torch_eager 21.17% 287.829us 99.59% 1.354ms 1.354ms 0.000us 0.00% 125.024us 125.024us 1 - aten::mul 11.38% 154.770us 19.33% 262.774us 10.949us 64.862us 52.62% 64.862us 2.703us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 64.862us 52.62% 64.862us 2.703us 24 - aten::copy_ 7.76% 105.560us 40.17% 546.058us 30.337us 39.265us 31.85% 41.025us 2.279us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.865us 23.42% 28.865us 2.405us 12 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.137us 15.53% 19.137us 1.595us 12 - aten::clone 1.51% 20.520us 34.08% 463.317us 77.220us 0.000us 0.00% 12.160us 2.027us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 8.44% 10.400us 1.733us 6 - aten::sub 2.90% 39.471us 4.67% 63.511us 10.585us 9.569us 7.76% 9.569us 1.595us 6 - aten::add 2.50% 34.030us 4.22% 57.431us 9.572us 9.568us 7.76% 9.568us 1.595us 6 - Activity Buffer Request 14.30% 194.363us 14.30% 194.363us 194.363us 1.760us 1.43% 1.760us 1.760us 1 - aten::empty_strided 2.23% 30.321us 2.23% 30.321us 5.053us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.45% 182.914us 13.45% 182.914us 30.486us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.84% 65.748us 6.29% 85.480us 3.562us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.45% 19.732us 1.45% 19.732us 0.822us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.08% 218.666us 16.08% 218.666us 4.556us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.41% 5.560us 0.41% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 926.227us 747.17% 926.227us 926.227us 1 + torch_eager 10.87% 288.725us 99.79% 2.651ms 2.651ms 0.000us 0.00% 125.755us 125.755us 1 + aten::mul 5.66% 150.315us 9.84% 261.507us 10.896us 65.119us 52.53% 65.119us 2.713us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.119us 52.53% 65.119us 2.713us 24 + aten::copy_ 3.77% 100.152us 69.45% 1.845ms 102.495us 39.455us 31.83% 41.246us 2.291us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.928us 23.34% 28.928us 2.411us 12 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.390us 15.64% 19.390us 1.616us 12 + aten::clone 0.89% 23.522us 66.73% 1.773ms 295.426us 0.000us 0.00% 12.318us 2.053us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.527us 8.49% 10.527us 1.755us 6 + aten::add 1.16% 30.840us 2.00% 53.221us 8.870us 9.759us 7.87% 9.759us 1.626us 6 + aten::sub 1.31% 34.853us 2.22% 58.863us 9.811us 9.631us 7.77% 9.631us 1.605us 6 + Activity Buffer Request 54.50% 1.448ms 54.50% 1.448ms 1.448ms 1.791us 1.44% 1.791us 1.791us 1 + aten::empty_strided 1.16% 30.740us 1.16% 30.740us 5.123us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.93% 237.245us 8.93% 237.245us 39.541us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.65% 70.502us 3.36% 89.223us 3.718us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.70% 18.721us 0.70% 18.721us 0.780us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.19% 217.516us 8.19% 217.516us 4.532us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 5.590us 0.21% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.359ms -Self CUDA time total: 123.264us +Self CPU time total: 2.656ms +Self CUDA time total: 123.964us @@ -4714,27 +4714,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 
934.359us 900.66% 934.359us 934.359us 1 - torch_eager 21.17% 286.322us 99.59% 1.347ms 1.347ms 0.000us 0.00% 105.086us 105.086us 1 - aten::mul 11.62% 157.214us 19.66% 265.945us 11.081us 55.327us 53.33% 55.327us 2.305us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.327us 53.33% 55.327us 2.305us 24 - aten::copy_ 7.65% 103.495us 39.66% 536.482us 29.805us 32.511us 31.34% 33.855us 1.881us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.704us 23.81% 24.704us 2.059us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.904us 15.33% 15.904us 1.325us 12 - aten::clone 1.57% 21.280us 33.91% 458.650us 76.442us 0.000us 0.00% 9.151us 1.525us 6 - aten::add 2.43% 32.883us 4.09% 55.372us 9.229us 8.001us 7.71% 8.001us 1.333us 6 - aten::sub 2.87% 38.810us 4.64% 62.781us 10.463us 7.903us 7.62% 7.903us 1.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 7.53% 7.807us 1.301us 6 - Activity Buffer Request 14.06% 190.184us 14.06% 190.184us 190.184us 1.344us 1.30% 1.344us 1.344us 1 - aten::empty_strided 2.22% 30.070us 2.22% 30.070us 5.012us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.39% 181.103us 13.39% 181.103us 30.184us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.05% 68.302us 6.56% 88.771us 3.699us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.51% 20.469us 1.51% 20.469us 0.853us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.03% 216.891us 16.03% 216.891us 4.519us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.41% 5.591us 0.41% 5.591us 5.591us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 925.174us 889.06% 925.174us 925.174us 1 + torch_eager 20.56% 293.458us 99.64% 1.423ms 1.423ms 0.000us 0.00% 105.438us 105.438us 1 + aten::mul 10.42% 148.708us 18.32% 261.500us 10.896us 55.264us 53.11% 55.264us 2.303us 24 +void 
at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.264us 53.11% 55.264us 2.303us 24 + aten::copy_ 7.08% 101.081us 43.33% 618.656us 34.370us 32.670us 31.39% 34.046us 1.891us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 23.74% 24.703us 2.059us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.128us 15.50% 16.128us 1.344us 12 + aten::clone 1.49% 21.220us 38.03% 542.913us 90.485us 0.000us 0.00% 9.343us 1.557us 6 + aten::sub 2.38% 33.992us 4.03% 57.481us 9.580us 8.064us 7.75% 8.064us 1.344us 6 + aten::add 2.21% 31.510us 3.80% 54.250us 9.042us 8.064us 7.75% 8.064us 1.344us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.967us 7.66% 7.967us 1.328us 6 + Activity Buffer Request 14.99% 214.036us 14.99% 214.036us 214.036us 1.376us 1.32% 1.376us 1.376us 1 + aten::empty_strided 2.13% 30.461us 2.13% 30.461us 5.077us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.05% 243.458us 17.05% 243.458us 40.576us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.68% 66.831us 5.99% 85.500us 3.562us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.31% 18.669us 1.31% 18.669us 0.778us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.35% 219.102us 15.35% 219.102us 4.565us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.36% 5.101us 0.36% 5.101us 5.101us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.353ms -Self CUDA time total: 103.742us +Self CPU time total: 1.428ms +Self CUDA time total: 104.062us @@ -4744,27 +4744,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.694us 764.03% 944.694us 944.694us 1 - torch_eager 20.48% 287.824us 99.60% 1.400ms 1.400ms 0.000us 0.00% 125.438us 125.438us 1 - aten::mul 10.91% 153.363us 18.83% 264.625us 11.026us 65.151us 52.69% 65.151us 2.715us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.151us 52.69% 65.151us 2.715us 24 - aten::copy_ 7.88% 110.793us 41.73% 586.532us 32.585us 39.328us 31.81% 41.120us 2.284us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.895us 23.37% 28.895us 2.408us 12 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.167us 15.50% 19.167us 1.597us 12 - aten::clone 1.52% 21.310us 35.87% 504.089us 84.015us 0.000us 0.00% 12.225us 2.038us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.433us 8.44% 10.433us 1.739us 6 - aten::sub 2.80% 39.332us 4.57% 64.213us 10.702us 9.632us 7.79% 9.632us 1.605us 6 - aten::add 2.33% 32.799us 3.97% 55.790us 9.298us 9.535us 7.71% 9.535us 1.589us 6 - Activity Buffer Request 15.08% 211.984us 15.08% 211.984us 211.984us 1.792us 1.45% 1.792us 1.792us 1 - aten::empty_strided 2.18% 30.690us 2.18% 30.690us 5.115us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.35% 201.734us 14.35% 201.734us 33.622us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.89% 68.724us 6.32% 88.851us 3.702us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.43% 20.127us 1.43% 20.127us 0.839us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.74% 221.155us 15.74% 221.155us 4.607us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.570us 0.40% 5.570us 5.570us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 901.909us 727.35% 901.909us 901.909us 1 + torch_eager 19.87% 274.810us 99.60% 1.377ms 1.377ms 0.000us 0.00% 125.791us 125.791us 1 + aten::mul 10.85% 149.967us 18.79% 259.807us 10.825us 65.086us 52.49% 65.086us 2.712us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.086us 52.49% 65.086us 2.712us 24 + aten::copy_ 7.46% 103.216us 42.83% 592.168us 32.898us 39.518us 31.87% 41.310us 2.295us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.862us 23.28% 28.862us 2.405us 12 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 19.395us 15.64% 19.395us 1.616us 12 + aten::clone 1.61% 22.200us 37.56% 519.385us 86.564us 0.000us 0.00% 12.448us 2.075us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.656us 8.59% 10.656us 1.776us 6 + aten::add 2.23% 30.899us 3.81% 52.660us 8.777us 9.730us 7.85% 9.730us 1.622us 6 + aten::sub 2.44% 33.801us 4.13% 57.151us 9.525us 9.665us 7.79% 9.665us 1.611us 6 + Activity Buffer Request 13.62% 188.345us 13.62% 188.345us 188.345us 1.792us 1.45% 1.792us 1.792us 1 + aten::empty_strided 2.34% 32.371us 2.34% 32.371us 5.395us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 17.39% 240.467us 17.39% 240.467us 40.078us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.87% 67.397us 6.22% 86.038us 3.585us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.35% 18.641us 1.35% 18.641us 0.777us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.56% 215.091us 15.56% 215.091us 4.481us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.540us 0.40% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.405ms -Self CUDA time total: 123.646us +Self CPU time total: 1.383ms +Self CUDA time total: 123.999us @@ -4774,27 +4774,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 
0.000us 938.077us 529.63% 938.077us 938.077us 1 - torch_eager 22.00% 288.844us 99.57% 1.307ms 1.307ms 0.000us 0.00% 179.967us 179.967us 1 - aten::mul 11.92% 156.562us 20.13% 264.245us 11.010us 94.881us 53.57% 94.881us 3.953us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.881us 53.57% 94.881us 3.953us 24 - aten::copy_ 8.04% 105.524us 37.72% 495.290us 27.516us 57.663us 32.56% 60.511us 3.362us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.672us 22.96% 40.672us 3.389us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.575us 13.87% 24.575us 2.048us 12 - aten::clone 1.60% 21.071us 31.51% 413.758us 68.960us 0.000us 0.00% 19.839us 3.306us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.991us 9.59% 16.991us 2.832us 6 - aten::add 2.42% 31.800us 4.16% 54.561us 9.093us 12.288us 6.94% 12.288us 2.048us 6 - aten::sub 3.05% 40.090us 5.01% 65.752us 10.959us 12.287us 6.94% 12.287us 2.048us 6 - Activity Buffer Request 10.75% 141.113us 10.75% 141.113us 141.113us 2.848us 1.61% 2.848us 2.848us 1 - aten::empty_strided 2.28% 29.940us 2.28% 29.940us 4.990us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 14.21% 186.543us 14.21% 186.543us 31.091us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.18% 67.990us 6.68% 87.660us 3.652us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.50% 19.670us 1.50% 19.670us 0.820us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.62% 218.216us 16.62% 218.216us 4.546us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.43% 5.650us 0.43% 5.650us 5.650us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.661us 533.26% 944.661us 944.661us 1 + torch_eager 10.70% 284.298us 99.79% 2.652ms 2.652ms 0.000us 0.00% 180.029us 180.029us 1 + aten::mul 6.06% 161.074us 10.27% 272.980us 11.374us 94.781us 53.50% 94.781us 3.949us 24 +void 
at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.781us 53.50% 94.781us 3.949us 24 + aten::copy_ 3.97% 105.392us 69.06% 1.835ms 101.961us 57.664us 32.55% 60.545us 3.364us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.607us 22.92% 40.607us 3.384us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 13.94% 24.703us 2.059us 12 + aten::clone 0.89% 23.759us 66.19% 1.759ms 293.179us 0.000us 0.00% 19.938us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.057us 9.63% 17.057us 2.843us 6 + aten::sub 1.37% 36.511us 2.33% 61.971us 10.329us 12.383us 6.99% 12.383us 2.064us 6 + aten::add 1.17% 31.070us 2.01% 53.400us 8.900us 12.320us 6.95% 12.320us 2.053us 6 + Activity Buffer Request 53.91% 1.433ms 53.91% 1.433ms 1.433ms 2.881us 1.63% 2.881us 2.881us 1 + aten::empty_strided 1.17% 31.132us 1.17% 31.132us 5.189us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.85% 235.245us 8.85% 235.245us 39.208us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.64% 70.123us 3.36% 89.202us 3.717us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.72% 19.079us 0.72% 19.079us 0.795us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.35% 221.788us 8.35% 221.788us 4.621us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 5.460us 0.21% 5.460us 5.460us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.313ms -Self CUDA time total: 177.119us +Self CPU time total: 2.657ms +Self CUDA time total: 177.148us @@ -4804,27 +4804,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 945.076us 318.26% 945.076us 945.076us 1 - torch_eager 21.55% 289.808us 99.58% 1.339ms 1.339ms 0.000us 0.00% 314.171us 314.171us 1 - aten::mul 11.43% 153.633us 19.62% 263.817us 10.992us 145.952us 49.15% 145.952us 6.081us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.952us 49.15% 145.952us 6.081us 24 - aten::copy_ 9.11% 122.489us 38.99% 524.297us 29.128us 110.173us 37.10% 127.389us 7.077us 18 - aten::clone 1.65% 22.169us 33.13% 445.468us 74.245us 0.000us 0.00% 70.110us 11.685us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.279us 19.29% 57.279us 4.773us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.894us 17.81% 52.894us 8.816us 6 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 40.830us 13.75% 40.830us 3.402us 12 - aten::sub 2.94% 39.549us 4.81% 64.690us 10.782us 20.511us 6.91% 20.511us 3.418us 6 - aten::add 2.41% 32.411us 4.09% 55.020us 9.170us 20.319us 6.84% 20.319us 3.386us 6 - Activity Buffer Request 11.32% 152.193us 11.32% 152.193us 152.193us 17.216us 5.80% 17.216us 17.216us 1 - aten::empty_strided 2.31% 31.082us 2.31% 31.082us 5.180us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.88% 186.593us 13.88% 186.593us 31.099us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.09% 68.450us 6.56% 88.160us 3.673us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.47% 19.710us 1.47% 19.710us 0.821us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.43% 220.956us 16.43% 220.956us 4.603us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.42% 5.661us 0.42% 5.661us 5.661us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 954.138us 321.69% 954.138us 954.138us 1 + torch_eager 11.45% 309.471us 99.80% 2.697ms 2.697ms 0.000us 0.00% 313.854us 313.854us 1 + aten::mul 5.62% 151.933us 9.84% 265.955us 11.081us 144.896us 48.85% 144.896us 6.037us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 144.896us 48.85% 144.896us 6.037us 24 + aten::copy_ 3.99% 107.722us 68.69% 1.856ms 103.120us 111.039us 37.44% 128.287us 7.127us 18 + aten::clone 1.05% 28.369us 65.82% 1.779ms 296.444us 0.000us 0.00% 70.944us 11.824us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.343us 19.33% 57.343us 4.779us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.696us 18.10% 53.696us 8.949us 6 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 40.671us 13.71% 40.671us 3.389us 12 + aten::sub 1.32% 35.620us 2.23% 60.211us 10.035us 20.448us 6.89% 20.448us 3.408us 6 + aten::add 1.16% 31.420us 1.99% 53.831us 8.972us 20.223us 6.82% 20.223us 3.371us 6 + Activity Buffer Request 53.66% 1.450ms 53.66% 1.450ms 1.450ms 17.248us 5.82% 17.248us 17.248us 1 + aten::empty_strided 1.25% 33.832us 1.25% 33.832us 5.639us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.57% 231.556us 8.57% 231.556us 38.593us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.58% 69.773us 3.29% 88.953us 3.706us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.71% 19.180us 0.71% 19.180us 0.799us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.44% 228.015us 8.44% 228.015us 4.750us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.370us 0.20% 5.370us 5.370us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.345ms -Self CUDA time total: 296.955us +Self CPU time total: 2.702ms +Self CUDA time total: 296.606us @@ -4834,27 +4834,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 986.080us 556.73% 986.080us 986.080us 1 - torch_eager 12.52% 336.567us 99.81% 2.683ms 2.683ms 0.000us 0.00% 179.999us 179.999us 1 - aten::mul 5.82% 156.365us 9.99% 268.575us 11.191us 
94.976us 53.62% 94.976us 3.957us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.976us 53.62% 94.976us 3.957us 24 - aten::copy_ 3.98% 106.939us 67.04% 1.802ms 100.094us 57.535us 32.48% 60.415us 3.356us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.703us 22.98% 40.703us 3.392us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.608us 13.89% 24.608us 2.051us 12 - aten::clone 1.08% 29.091us 64.22% 1.726ms 287.668us 0.000us 0.00% 19.712us 3.285us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 9.50% 16.832us 2.805us 6 - aten::add 1.21% 32.499us 2.06% 55.240us 9.207us 12.320us 6.96% 12.320us 2.053us 6 - aten::sub 1.59% 42.650us 2.57% 69.041us 11.507us 12.288us 6.94% 12.288us 2.048us 6 - Activity Buffer Request 53.52% 1.438ms 53.52% 1.438ms 1.438ms 2.880us 1.63% 2.880us 2.880us 1 - aten::empty_strided 1.16% 31.221us 1.16% 31.221us 5.204us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 7.20% 193.473us 7.20% 193.473us 32.245us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.61% 70.195us 3.39% 91.232us 3.801us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.78% 21.037us 0.78% 21.037us 0.877us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.35% 224.324us 8.35% 224.324us 4.673us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 4.980us 0.19% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 930.130us 525.53% 930.130us 930.130us 1 + torch_eager 19.64% 282.826us 99.65% 1.435ms 1.435ms 0.000us 0.00% 179.836us 179.836us 1 + aten::mul 10.48% 150.844us 18.43% 265.387us 11.058us 94.845us 53.59% 94.845us 3.952us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 94.845us 53.59% 94.845us 3.952us 24 + aten::copy_ 8.38% 120.684us 44.09% 634.887us 35.272us 57.502us 32.49% 60.350us 3.353us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.478us 22.87% 40.478us 3.373us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 13.92% 24.641us 2.053us 12 + aten::clone 1.49% 21.461us 38.48% 554.053us 92.342us 0.000us 0.00% 19.872us 3.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 9.62% 17.024us 2.837us 6 + aten::sub 2.41% 34.731us 4.09% 58.881us 9.813us 12.353us 6.98% 12.353us 2.059us 6 + aten::add 2.13% 30.662us 3.72% 53.511us 8.919us 12.288us 6.94% 12.288us 2.048us 6 + Activity Buffer Request 15.30% 220.275us 15.30% 220.275us 220.275us 2.848us 1.61% 2.848us 2.848us 1 + aten::empty_strided 2.11% 30.450us 2.11% 30.450us 5.075us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 15.99% 230.296us 15.99% 230.296us 38.383us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.74% 68.240us 6.08% 87.483us 3.645us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.34% 19.243us 1.34% 19.243us 0.802us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.64% 225.174us 15.64% 225.174us 4.691us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.110us 0.35% 5.110us 5.110us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.688ms -Self CUDA time total: 177.119us +Self CPU time total: 1.440ms +Self CUDA time total: 176.988us @@ -4864,27 +4864,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ 
------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 955.007us 321.87% 955.007us 955.007us 1 - torch_eager 21.61% 290.382us 99.58% 1.338ms 1.338ms 0.000us 0.00% 314.050us 314.050us 1 - aten::mul 12.35% 165.965us 20.49% 275.388us 11.475us 146.274us 49.30% 146.274us 6.095us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.274us 49.30% 146.274us 6.095us 24 - aten::copy_ 7.99% 107.375us 38.18% 513.111us 28.506us 109.984us 37.07% 127.328us 7.074us 18 - aten::clone 1.53% 20.570us 31.98% 429.868us 71.645us 0.000us 0.00% 70.048us 11.675us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.280us 19.31% 57.280us 4.773us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.704us 17.76% 52.704us 8.784us 6 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 40.448us 13.63% 40.448us 3.371us 12 - aten::sub 2.99% 40.150us 4.79% 64.400us 10.733us 20.288us 6.84% 20.288us 3.381us 6 - aten::add 2.45% 32.907us 4.13% 55.499us 9.250us 20.160us 6.79% 20.160us 3.360us 6 - Activity Buffer Request 11.77% 158.223us 11.77% 158.223us 158.223us 17.344us 5.85% 17.344us 17.344us 1 - aten::empty_strided 2.28% 30.711us 2.28% 30.711us 5.118us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.78% 185.224us 13.78% 185.224us 30.871us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.13% 68.942us 6.58% 88.372us 3.682us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.45% 19.430us 1.45% 19.430us 0.810us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.26% 218.554us 16.26% 218.554us 4.553us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.42% 5.611us 0.42% 5.611us 5.611us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.347us 313.60% 931.347us 931.347us 1 + torch_eager 20.13% 283.358us 99.65% 1.403ms 1.403ms 0.000us 0.00% 314.679us 314.679us 1 + aten::mul 10.72% 150.883us 18.79% 264.457us 11.019us 145.371us 48.95% 145.371us 6.057us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.371us 48.95% 145.371us 6.057us 24 + aten::copy_ 7.40% 104.164us 42.97% 604.868us 33.604us 110.845us 37.32% 128.541us 7.141us 18 + aten::clone 1.53% 21.600us 37.15% 522.944us 87.157us 0.000us 0.00% 71.357us 11.893us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.184us 19.25% 57.184us 4.765us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.661us 18.07% 53.661us 8.944us 6 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 40.767us 13.73% 40.767us 3.397us 12 + aten::add 2.28% 32.151us 3.88% 54.682us 9.114us 20.446us 6.88% 20.446us 3.408us 6 + aten::sub 2.39% 33.622us 4.06% 57.171us 9.528us 20.321us 6.84% 20.321us 3.387us 6 + Activity Buffer Request 14.77% 207.975us 14.77% 207.975us 207.975us 17.696us 5.96% 17.696us 17.696us 1 + aten::empty_strided 2.15% 30.270us 2.15% 30.270us 5.045us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.22% 228.377us 16.22% 228.377us 38.063us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.75% 66.830us 6.13% 86.290us 3.595us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.38% 19.460us 1.38% 19.460us 0.811us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.91% 224.006us 15.91% 224.006us 4.667us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 4.971us 0.35% 4.971us 4.971us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.344ms -Self CUDA time total: 296.706us +Self CPU time total: 1.408ms +Self CUDA time total: 296.983us @@ -4894,27 +4894,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 962.939us 164.48% 962.939us 962.939us 1 - torch_eager 21.30% 292.019us 99.59% 1.365ms 1.365ms 0.000us 0.00% 609.117us 609.117us 1 - aten::copy_ 7.59% 104.052us 39.10% 536.059us 
29.781us 268.735us 45.90% 292.415us 16.245us 18 - aten::mul 11.61% 159.130us 19.77% 271.083us 11.295us 251.454us 42.95% 251.454us 10.477us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.454us 42.95% 251.454us 10.477us 24 - aten::clone 1.60% 21.919us 33.19% 455.067us 75.844us 0.000us 0.00% 201.504us 33.584us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 177.824us 30.37% 177.824us 29.637us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.911us 15.53% 90.911us 7.576us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.248us 11.15% 65.248us 5.437us 12 - aten::sub 2.98% 40.869us 4.94% 67.700us 11.283us 32.703us 5.59% 32.703us 5.451us 6 - aten::add 2.40% 32.850us 4.07% 55.841us 9.307us 32.545us 5.56% 32.545us 5.424us 6 - Activity Buffer Request 13.18% 180.724us 13.18% 180.724us 180.724us 23.680us 4.04% 23.680us 23.680us 1 - aten::empty_strided 2.23% 30.541us 2.23% 30.541us 5.090us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 13.71% 188.023us 13.71% 188.023us 31.337us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 5.13% 70.322us 6.59% 90.292us 3.762us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.46% 19.970us 1.46% 19.970us 0.832us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.41% 225.035us 16.41% 225.035us 4.688us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.41% 5.640us 0.41% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.511us 159.85% 931.511us 931.511us 1 + torch_eager 19.89% 283.237us 99.62% 1.419ms 1.419ms 0.000us 0.00% 606.457us 606.457us 1 + aten::copy_ 7.21% 102.593us 43.52% 619.697us 34.428us 267.708us 45.94% 291.419us 16.190us 18 + aten::mul 10.56% 150.425us 18.55% 264.165us 11.007us 249.406us 42.80% 249.406us 10.392us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 
0.00% 0.000us 0.00% 0.000us 0.000us 249.406us 42.80% 249.406us 10.392us 24 + aten::clone 1.52% 21.631us 38.04% 541.603us 90.267us 0.000us 0.00% 201.277us 33.546us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 177.566us 30.47% 177.566us 29.594us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.142us 15.47% 90.142us 7.512us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.632us 11.26% 65.632us 5.469us 12 + aten::add 2.16% 30.762us 3.77% 53.662us 8.944us 32.832us 5.63% 32.832us 5.472us 6 + aten::sub 2.53% 36.013us 4.23% 60.192us 10.032us 32.800us 5.63% 32.800us 5.467us 6 + Activity Buffer Request 14.90% 212.145us 14.90% 212.145us 212.145us 23.711us 4.07% 23.711us 23.711us 1 + aten::empty_strided 2.14% 30.440us 2.14% 30.440us 5.073us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.99% 241.846us 16.99% 241.846us 40.308us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.71% 67.093us 6.00% 85.482us 3.562us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.29% 18.389us 1.29% 18.389us 0.766us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.73% 223.932us 15.73% 223.932us 4.665us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.38% 5.360us 0.38% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.371ms -Self CUDA time total: 585.437us +Self CPU time total: 1.424ms +Self CUDA time total: 582.746us @@ -4924,55 +4924,61 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA 
Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 9.18% 318.848us 77.56% 2.693ms 2.693ms 0.000us 0.00% 1.840ms 1.840ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.811ms 102.06% 1.811ms 1.811ms 1 - aten::copy_ 3.19% 110.682us 53.02% 1.841ms 102.257us 792.737us 44.68% 858.369us 47.687us 18 - aten::mul 4.39% 152.554us 7.57% 262.845us 10.952us 833.316us 46.97% 833.316us 34.721us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 833.316us 46.97% 833.316us 34.721us 24 - aten::clone 0.79% 27.538us 50.82% 1.764ms 294.050us 0.000us 0.00% 624.865us 104.144us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 559.233us 31.52% 559.233us 93.206us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.504us 13.16% 233.504us 19.459us 12 -void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 148.032us 8.34% 148.032us 12.336us 12 - aten::sub 1.13% 39.132us 1.88% 65.111us 10.852us 90.112us 5.08% 90.112us 15.019us 6 - Activity Buffer Request 41.37% 1.436ms 41.37% 1.436ms 1.436ms 65.632us 3.70% 65.632us 65.632us 1 - aten::add 0.97% 33.650us 1.61% 56.062us 9.344us 57.920us 3.26% 57.920us 9.653us 6 - aten::empty_strided 0.92% 31.941us 0.92% 31.941us 5.324us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 6.62% 229.834us 6.62% 229.834us 38.306us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.00% 69.363us 2.59% 89.831us 3.743us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.59% 20.468us 0.59% 20.468us 0.853us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 6.41% 222.613us 6.41% 222.613us 4.638us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 22.44% 778.913us 22.44% 778.913us 778.913us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 13.84% 306.170us 64.60% 1.429ms 1.429ms 0.000us 0.00% 1.835ms 1.835ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.808ms 102.17% 1.808ms 1.808ms 1 + aten::copy_ 5.17% 114.346us 26.90% 594.995us 33.055us 791.984us 44.77% 858.095us 47.672us 18 + aten::mul 6.78% 150.032us 12.17% 269.044us 11.210us 828.790us 46.85% 828.790us 34.533us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 828.790us 46.85% 828.790us 34.533us 24 + aten::clone 1.04% 23.090us 22.74% 502.934us 83.822us 0.000us 0.00% 626.230us 104.372us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 560.119us 31.66% 560.119us 93.353us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 231.865us 13.11% 231.865us 19.322us 12 +void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 148.413us 8.39% 148.413us 12.368us 12 + aten::sub 1.69% 37.309us 2.75% 60.900us 10.150us 90.142us 5.10% 90.142us 15.024us 6 + Activity Buffer Request 8.38% 185.324us 8.38% 185.324us 185.324us 66.111us 3.74% 66.111us 66.111us 1 + aten::add 1.41% 31.181us 2.49% 55.022us 9.170us 58.271us 3.29% 58.271us 9.712us 6 + aten::empty_strided 1.45% 31.982us 1.45% 31.982us 5.330us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.29% 227.584us 10.29% 227.584us 37.931us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.11% 68.695us 3.96% 87.553us 3.648us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.85% 18.858us 0.85% 18.858us 0.786us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 10.59% 234.185us 10.59% 234.185us 4.879us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 35.40% 782.770us 35.40% 782.770us 782.770us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.472ms -Self CUDA time total: 1.774ms +Self CPU time total: 2.212ms +Self CUDA time total: 1.769ms impl wl p50(ms) ok -torch_eager cuda_B1_S128_H32_D128_R64 0.22 True -torch_eager cuda_B1_S128_H32_D64_R32 0.23 True -torch_eager cuda_B1_S128_H8_D128_R64 0.23 True +torch_eager cuda_B1_S128_H32_D128_R64 0.21 True +torch_eager cuda_B1_S128_H32_D64_R32 0.22 True +torch_eager cuda_B1_S128_H8_D128_R64 0.22 True torch_eager cuda_B1_S128_H8_D64_R32 0.17 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True -torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True -torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True +torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True +torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True torch_eager cuda_B1_S512_H32_D128_R64 0.22 True -torch_eager cuda_B1_S512_H32_D64_R32 0.22 
True -torch_eager cuda_B1_S512_H8_D128_R64 0.22 True -torch_eager cuda_B1_S512_H8_D64_R32 0.22 True -torch_eager cuda_B2_S128_H32_D128_R64 0.22 True +torch_eager cuda_B1_S512_H32_D64_R32 0.21 True +torch_eager cuda_B1_S512_H8_D128_R64 0.21 True +torch_eager cuda_B1_S512_H8_D64_R32 0.21 True +torch_eager cuda_B2_S128_H32_D128_R64 0.21 True torch_eager cuda_B2_S128_H32_D64_R32 0.22 True -torch_eager cuda_B2_S128_H8_D128_R64 0.22 True +torch_eager cuda_B2_S128_H8_D128_R64 0.21 True torch_eager cuda_B2_S128_H8_D64_R32 0.22 True torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True -torch_eager cuda_B2_S2048_H8_D128_R64 0.23 True +torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True torch_eager cuda_B2_S512_H32_D128_R64 0.22 True -torch_eager cuda_B2_S512_H32_D64_R32 0.23 True -torch_eager cuda_B2_S512_H8_D128_R64 0.22 True -torch_eager cuda_B2_S512_H8_D64_R32 0.22 True +torch_eager cuda_B2_S512_H32_D64_R32 0.22 True +torch_eager cuda_B2_S512_H8_D128_R64 0.21 True +torch_eager cuda_B2_S512_H8_D64_R32 0.21 True
+
+
▶ UV Install Logs
+ +

Artifacts:

rotary.jsonl diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg index 7e204b4a84d9a5e16538227357f7ff28e8f5c02e..36f9217b1247fd55602a048202775fbf3d19cd24 100644 --- a/rotary/results/artifacts/combine/latency.svg +++ b/rotary/results/artifacts/combine/latency.svg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5f227e4c6029d72861d1a351c2c4353a8589dfeadaaf2aa034c7c28ec49a733 -size 37854 +oid sha256:1df6ff7a8f4a24eba95824695c07fcf25601f7f648a0a0773f7d1bc7119d9fd2 +size 37849 diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html index 7f3eb837fca79886475bc80b225a1aa3afc35557..28fa630c24a01c9c2557497761cffaee1e2da610 100644 --- a/rotary/results/combined_results.html +++ b/rotary/results/combined_results.html @@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content { - 2025-10-30T15:53:49.568408 + 2025-10-31T20:14:10.200761 image/svg+xml @@ -4451,109 +4451,109 @@ body[data-tool="eraser"] .main-content { - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 - + - + - 0.6 + 0.6 - + - + - 0.7 + 0.7 - + - + - 0.8 + 0.8 @@ -4561,67 +4561,67 @@ body[data-tool="eraser"] .main-content { - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + @@ -4679,7 +4679,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: combine | 4.37s +Cell: combine | 4.46s | Raw @@ -4771,8 +4771,8 @@ hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 True hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.26 True -hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 True -hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 True +hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.10 True +hf_kernels_rotary 
cuda_B1_S2048_H8_D128_R64 0.10 True hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 True @@ -4783,37 +4783,37 @@ hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.85 True -hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.26 True +hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.27 True hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 True hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 True hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 True -torch_eager cuda_B1_S128_H32_D128_R64 0.22 True -torch_eager cuda_B1_S128_H32_D64_R32 0.23 True -torch_eager cuda_B1_S128_H8_D128_R64 0.23 True +torch_eager cuda_B1_S128_H32_D128_R64 0.21 True +torch_eager cuda_B1_S128_H32_D64_R32 0.22 True +torch_eager cuda_B1_S128_H8_D128_R64 0.22 True torch_eager cuda_B1_S128_H8_D64_R32 0.17 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True -torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True -torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True +torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True +torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True torch_eager cuda_B1_S512_H32_D128_R64 0.22 True -torch_eager cuda_B1_S512_H32_D64_R32 0.22 True -torch_eager cuda_B1_S512_H8_D128_R64 0.22 True -torch_eager cuda_B1_S512_H8_D64_R32 0.22 True -torch_eager cuda_B2_S128_H32_D128_R64 0.22 True +torch_eager cuda_B1_S512_H32_D64_R32 0.21 True +torch_eager cuda_B1_S512_H8_D128_R64 0.21 True +torch_eager cuda_B1_S512_H8_D64_R32 0.21 True +torch_eager cuda_B2_S128_H32_D128_R64 0.21 True torch_eager 
cuda_B2_S128_H32_D64_R32 0.22 True -torch_eager cuda_B2_S128_H8_D128_R64 0.22 True +torch_eager cuda_B2_S128_H8_D128_R64 0.21 True torch_eager cuda_B2_S128_H8_D64_R32 0.22 True torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True -torch_eager cuda_B2_S2048_H8_D128_R64 0.23 True +torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True torch_eager cuda_B2_S512_H32_D128_R64 0.22 True -torch_eager cuda_B2_S512_H32_D64_R32 0.23 True -torch_eager cuda_B2_S512_H8_D128_R64 0.22 True -torch_eager cuda_B2_S512_H8_D64_R32 0.22 True +torch_eager cuda_B2_S512_H32_D64_R32 0.22 True +torch_eager cuda_B2_S512_H8_D128_R64 0.21 True +torch_eager cuda_B2_S512_H8_D64_R32 0.21 True GENERATING COMBINED VISUALIZATION @@ -4833,7 +4833,7 @@ Implementations included:
▶ UV Install Logs
@@ -4846,7 +4846,7 @@ Installed 37 packages in 193ms - 2025-10-30T15:53:49.568408 + 2025-10-31T20:14:10.200761 image/svg+xml @@ -5190,109 +5190,109 @@ Installed 37 packages in 193ms - + - + - 0.1 + 0.1 - + - + - 0.2 + 0.2 - + - + - 0.3 + 0.3 - + - + - 0.4 + 0.4 - + - + - 0.5 + 0.5 - + - + - 0.6 + 0.6 - + - + - 0.7 + 0.7 - + - + - 0.8 + 0.8 @@ -5300,67 +5300,67 @@ Installed 37 packages in 193ms - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + +