diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl index 3e006de96bb2daedf135e24fcb717a0479a9b199..7d1a3c651f4f5824ec6a49a44a98836bdbadbb74 100644 --- a/activation/impls/artifacts/benchmark/activation.jsonl +++ b/activation/impls/artifacts/benchmark/activation.jsonl @@ -1,9 +1,9 @@ -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.040432000048440386, "p50": 0.04165099994679622, "p90": 0.0417410000181917, "mean": 0.04172699999571705, "iqr": 0.0011400000516914588, "raw_times": [0.0417410000181917, 0.04420999999865671, 0.040432000048440386, 0.04165099994679622, 0.04060099996650024], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.046430999987023824, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04963099996757592, "p50": 0.05265099997586731, "p90": 0.053851000018312334, "mean": 0.054568999985349365, "iqr": 0.0016500000583619112, "raw_times": [0.04963099996757592, 0.05265099997586731, 0.05220099995995042, 0.053851000018312334, 0.06451100000504084], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05472100002634761, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04966099999137441, "p50": 0.05102099999021448, "p90": 0.05103099999814731, "mean": 0.05151719999503257, "iqr": 0.0007099999947968172, "raw_times": [0.04966099999137441, 0.05555199999207616, 0.05032100000335049, 0.05102099999021448, 0.05103099999814731], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05423200002496742, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04886099998202553, "p50": 0.05024199998615586, "p90": 0.0503609999782384, "mean": 0.05005519998348973, "iqr": 0.0007900000014160469, "raw_times": [0.04886099998202553, 0.04957099997682235, 0.051240999994206504, 0.05024199998615586, 0.0503609999782384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053871000034177996, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04914099997677113, "p50": 0.04985100002841136, "p90": 0.05049099996767836, "mean": 0.04988699998875745, "iqr": 0.0013399999829744047, "raw_times": [0.04915099998470396, 0.05080099998622245, 0.04985100002841136, 0.04914099997677113, 0.05049099996767836], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053920999960155314, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656100003330721, "p50": 0.04960100000062084, "p90": 0.05333199999313365, "mean": 0.05254540001260466, "iqr": 0.0039209999727063405, "raw_times": [0.04656100003330721, 0.05333199999313365, 0.04960100000062084, 0.04941100002042731, 0.06382200001553429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051971000004868984, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04889099994898061, "p50": 0.050290999979552, "p90": 0.05037099998617123, "mean": 0.05047499996635452, "iqr": 0.0002600000357233512, "raw_times": [0.04889099994898061, 0.052710999966620875, 0.050110999950447876, 0.05037099998617123, 0.050290999979552], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05234200000359124, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0489209999727791, "p50": 0.04973099999006081, "p90": 0.05078099997035679, "mean": 0.051391199974659685, "iqr": 0.0012099999935344385, "raw_times": [0.0489209999727791, 0.05078099997035679, 0.04973099999006081, 0.04957099997682235, 0.05795199996327938], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0512020000087432, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} -{"ts": "2025-10-29T15:50:51Z", "run": "e54f22d2514045f0929111d26589784b", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04852099999652637, "p50": 0.04917100000056962, "p90": 0.049370999988695985, "mean": 0.049055200008751854, "iqr": 0.0007299999538190605, "raw_times": [0.04852099999652637, 0.048641000034876924, 0.04917100000056962, 0.049370999988695985, 0.04957200002309037], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05309099998385136, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.022969999974975508, "p50": 0.023499999997511622, "p90": 0.023961000010785938, "mean": 0.02361460000201987, "iqr": 0.0009899999895424116, "raw_times": [0.022971000021243526, 0.022969999974975508, 0.023961000010785938, 0.023499999997511622, 0.024671000005582755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03073999999969601, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027540000019143918, "p50": 0.029130999962490023, "p90": 0.03002100004323438, "mean": 0.029014800009008468, "iqr": 0.0016900000332498166, "raw_times": [0.027540000019143918, 0.030051000010189455, 0.03002100004323438, 0.029130999962490023, 0.028331000009984564], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343999998151048, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02829999999676147, "p50": 0.029119999965132592, "p90": 0.03051000004461457, "mean": 0.029939999990347133, "iqr": 0.0019500000689731678, "raw_times": [0.02829999999676147, 0.03051000004461457, 0.033209999969585624, 0.029119999965132592, 0.028559999975641404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031761000002461515, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027531000000635686, "p50": 0.028170999996746104, "p90": 0.028501000031155854, "mean": 0.028293000002577173, "iqr": 0.0008900000239009387, "raw_times": [0.027611000007254916, 0.028170999996746104, 0.029650999977093306, 0.027531000000635686, 0.028501000031155854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03130000004603062, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02755000002707675, "p50": 0.02861000001530556, "p90": 0.028831000008722185, "mean": 0.02867660001584227, "iqr": 0.00023000001192485797, "raw_times": [0.028600999996797327, 0.029791000031309522, 0.028831000008722185, 0.02755000002707675, 0.02861000001530556], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03139000000373926, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02618100000972845, "p50": 0.027131000024382956, "p90": 0.02731099999664366, "mean": 0.026918799994746223, "iqr": 0.0007610000238855719, "raw_times": [0.02618100000972845, 0.027131000024382956, 0.027420999970217963, 0.02731099999664366, 0.026549999972758087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03008099997714453, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026950999995278835, "p50": 0.02748000002839035, "p90": 0.02804100000730614, "mean": 0.02758480000011332, "iqr": 0.0006300000450210064, "raw_times": [0.026950999995278835, 0.02804100000730614, 0.027410999962285132, 0.02804100000730614, 0.02748000002839035], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03104999996139668, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026300000001810986, "p50": 0.02733100001250932, "p90": 0.0275399999623005, "mean": 0.02720039998393986, "iqr": 0.0004789999934473599, "raw_times": [0.02706099996885314, 0.02733100001250932, 0.027769999974225357, 0.0275399999623005, 0.026300000001810986], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03032000000757762, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02642100002958614, "p50": 0.027860999978202017, "p90": 0.02790100000993334, "mean": 0.027615000010428048, "iqr": 0.00036000000136482413, "raw_times": [0.02642100002958614, 0.028351000025850226, 0.027541000008568517, 0.02790100000993334, 0.027860999978202017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03163999997468636, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py index 711af9e01652ef5081b507affd0f7df9ac99e644..04f9df27c14acf429b58dba6cf0677c00cbbbced 100644 --- a/activation/impls/cells/benchmark.py +++ b/activation/impls/cells/benchmark.py @@ -4,6 +4,7 @@ # "numpy", # "torch==2.8.0", # "kernels-benchmark-tools", +# "kernels", # ] # # [tool.uv.sources] @@ -12,17 +13,22 @@ import torch import sys from kernels_benchmark_tools import KernelTypeEnum, run_benchmark -import torch, torch.nn.functional as F +from kernels import get_kernel +# Load the activation kernel +activation = get_kernel("kernels-community/activation") -def swiglu_eager(x): - d = x.shape[-1] // 2 - return F.silu(x[..., :d]) * x[..., d:] + +def hf_kernels_swiglu(input_tensor): + hidden_dim = input_tensor.shape[-1] // 2 + out_shape = input_tensor.shape[:-1] + (hidden_dim,) + out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device) + return activation.silu_and_mul(out, input_tensor) run_benchmark( kernel_type=KernelTypeEnum.ACTIVATION, - impl_name="torch_eager", - impl_tags={"family":"hf-kernels", "backend":"eager"}, - impl_func=swiglu_eager, + impl_name="hf_kernels_swiglu", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_swiglu, ) \ No newline at end of file diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html index 8384c1a7290b7dc38496729333c6a6825ebff89b..cb9b44ffb1c09312b21e7b7e432e9d78fbf6e49d 100644 --- a/activation/impls/hf_kernels_swiglu.html +++ b/activation/impls/hf_kernels_swiglu.html @@ -4,6 +4,11 @@