{"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07309099999019963, "p50": 0.07444199997053147, "p90": 0.07482099999833736, "mean": 0.07456319998482286, "iqr": 0.00039000002516331733, "raw_times": [0.07443099997317404, 0.0760309999918718, 0.07482099999833736, 0.07444199997053147, 0.07309099999019963], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08203099997672325, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09060200000021723, "p50": 0.09103200000026845, "p90": 0.09151199998314041, "mean": 0.09118959999341314, "iqr": 0.0008709999974598759, "raw_times": [0.09060200000021723, 0.09216099999775906, 0.09103200000026845, 0.09151199998314041, 0.09064099998568054], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09373199998208293, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08705100003680855, "p50": 0.0875320000091051, "p90": 0.08769100003291896, "mean": 0.08876720002035654, "iqr": 0.0002699999868127634, "raw_times": [0.0874210000461062, 0.09414099997684389, 0.08705100003680855, 0.08769100003291896, 0.0875320000091051], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09279099998593665, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08560100002341642, "p50": 0.08801100000255246, "p90": 0.08860200000526675, "mean": 0.08908540002039445, "iqr": 0.0012109999829590379, "raw_times": [0.08739100002230771, 0.08801100000255246, 0.08860200000526675, 0.09582200004842889, 0.08560100002341642], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09329199997409887, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08852199999864752, "p50": 0.08866100000659571, "p90": 0.08963200002654048, "mean": 0.08911940001326002, "iqr": 0.0010109999948326731, "raw_times": [0.08963200002654048, 0.08862100003170781, 0.08852199999864752, 0.08866100000659571, 0.09016100000280858], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0921010000070055, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08753199995226169, "p50": 0.08826099997349957, "p90": 0.08928100004368389, "mean": 0.08895959999790648, "iqr": 0.001079000014669873, "raw_times": [0.08753199995226169, 0.08928100004368389, 0.08820200002901402, 0.09152199999107324, 0.08826099997349957], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0922809999792662, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08700099999714439, "p50": 0.08810100001710452, "p90": 0.08876099997223719, "mean": 0.08815519998961463, "iqr": 0.0012099999935344385, "raw_times": [0.08700099999714439, 0.0893619999828843, 0.08876099997223719, 0.08810100001710452, 0.08755099997870275], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09151099999371581, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08636100000103397, "p50": 0.08706099998789796, "p90": 0.0880219999999099, "mean": 0.08728360000986868, "iqr": 0.0015599999869664316, "raw_times": [0.08646200001294346, 0.0885120000475581, 0.0880219999999099, 0.08706099998789796, 0.08636100000103397], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09124100000690305, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.08869100003039421, "p90": 0.09006199996974829, "mean": 0.08888559999604695, "iqr": 0.0023309999619414157, "raw_times": [0.08773100000780687, 0.09064199997510514, 0.09006199996974829, 0.08730199999718025, 0.08869100003039421], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09268100001236235, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08675099996935387, "p50": 0.08854100002508858, "p90": 0.08863200002906524, "mean": 0.08941120000827141, "iqr": 0.00029099999210302485, "raw_times": [0.09479099998088714, 0.08863200002906524, 0.08854100002508858, 0.08675099996935387, 0.08834100003696221], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09256099997401179, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08763099998532198, "p50": 0.08916199999475793, "p90": 0.08947200001330202, "mean": 0.08909940000876304, "iqr": 0.000891000013325538, "raw_times": [0.08947200001330202, 0.08763099998532198, 0.08916199999475793, 0.09065100005045679, 0.08858099999997648], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0931619999846589, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2592540000136978, "p50": 0.2617740000232516, "p90": 0.2619539999955123, "mean": 0.2612200000157827, "iqr": 0.0011099999710495467, "raw_times": [0.2617740000232516, 0.2622740000219892, 0.2619539999955123, 0.26084400002446273, 0.2592540000136978], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2616440000338116, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:55Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08504199996650641, "p50": 0.08663200003411475, "p90": 0.0882019999721706, "mean": 0.08694359999026346, "iqr": 0.0022109999804342806, "raw_times": [0.08663200003411475, 0.0882019999721706, 0.08885099998678925, 0.08599099999173632, 0.08504199996650641], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08975200000804762, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0891509999974005, "p50": 0.08992100003979431, "p90": 0.09012199996050185, "mean": 0.0899451999998746, "iqr": 0.0002709999762373627, "raw_times": [0.08985099998426449, 0.09068100001741186, 0.09012199996050185, 0.08992100003979431, 0.0891509999974005], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09275200000047334, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08789200001046993, "p50": 0.08992099998295089, "p90": 0.0902720000226509, "mean": 0.09012159999883806, "iqr": 0.0012010000318696257, "raw_times": [0.08789200001046993, 0.08907099999078127, 0.09345199998733733, 0.0902720000226509, 0.08992099998295089], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09202100000038627, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08713099998658436, "p50": 0.08851199999071468, "p90": 0.08962200001860765, "mean": 0.09088959999417057, "iqr": 0.0023510000346504967, "raw_times": [0.08713099998658436, 0.08851199999071468, 0.08962200001860765, 0.08727099998395715, 0.10191199999098899], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08909199999607154, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08660100002089166, "p50": 0.08889200000794517, "p90": 0.08962200001860765, "mean": 0.08841560002110782, "iqr": 0.002391000009538402, "raw_times": [0.08889200000794517, 0.08660100002089166, 0.08723100000906925, 0.08962200001860765, 0.08973200004902537], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10579199999938282, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08826199996292416, "p50": 0.08903100001589337, "p90": 0.0892219999855115, "mean": 0.0892053999905329, "iqr": 0.0008609999895270448, "raw_times": [0.08826199996292416, 0.09115099999235099, 0.0892219999855115, 0.08836099999598446, 0.08903100001589337], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09266099999649668, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08734199997206815, "p50": 0.0891519999868251, "p90": 0.09024100000942781, "mean": 0.0889337999979034, "iqr": 0.002558999995017075, "raw_times": [0.08734199997206815, 0.09024100000942781, 0.09025200000678524, 0.0891519999868251, 0.08768200001441073], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09219200001098216, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08766199999854507, "p50": 0.0882019999721706, "p90": 0.08848099997749159, "mean": 0.08829339999465446, "iqr": 0.0005199999577598646, "raw_times": [0.08766199999854507, 0.08848099997749159, 0.08916100000533334, 0.08796100001973173, 0.0882019999721706], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0894309999921461, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08695100001432365, "p50": 0.08826100003034298, "p90": 0.08882200000925877, "mean": 0.08816519999754746, "iqr": 0.001351000037175254, "raw_times": [0.08747099997208352, 0.08882200000925877, 0.08826100003034298, 0.08695100001432365, 0.08932099996172838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09112100002539592, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08768100002498613, "p50": 0.08872199998677388, "p90": 0.08966100000407096, "mean": 0.09043360000760003, "iqr": 0.0018989999830409943, "raw_times": [0.09834200000113924, 0.08966100000407096, 0.08872199998677388, 0.08768100002498613, 0.08776200002102996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08996200000410681, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2556240000330945, "p50": 0.2579839999725664, "p90": 0.2584439999964161, "mean": 0.258233800002472, "iqr": 0.0005109999960950518, "raw_times": [0.2579839999725664, 0.2584439999964161, 0.2556240000330945, 0.2611840000099619, 0.25793300000032104], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2541540000038367, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null} {"ts": "2025-11-10T21:58:56Z", "run": "3078eb3fdd0247809e806075fdf7ee85", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8455130000015743, "p50": 0.8465030000479601, "p90": 0.850922999973136, "mean": 0.8485591999942699, "iqr": 0.005059999978129781, "raw_times": [0.850922999973136, 0.8465030000479601, 0.8458629999950062, 0.8455130000015743, 0.8539939999536728], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8586040000295725, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}