diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl
index 5cb1d2e7a3df4383b17555d3c5513bbb6d567a4e..93cac36bf4f689de57400a82e22b49cf0344ff7b 100644
--- a/activation/impls/artifacts/benchmark/activation.jsonl
+++ b/activation/impls/artifacts/benchmark/activation.jsonl
@@ -1,9 +1,9 @@
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02110099990204617, "p50": 0.022570000055566197, "p90": 0.02266100000269944, "mean": 0.022242599993660406, "iqr": 0.0007410000080199097, "raw_times": [0.022570000055566197, 0.022961000013310695, 0.02191999999467953, 0.02266100000269944, 0.02110099990204617], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02889100005631917, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02585100003216212, "p50": 0.02831100005096232, "p90": 0.02854100000604376, "mean": 0.02791500000967062, "iqr": 0.0013400000398178236, "raw_times": [0.02585100003216212, 0.02854100000604376, 0.02967099999295897, 0.02831100005096232, 0.027200999966225936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031750999937685265, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02804099995046272, "p50": 0.028271000019230996, "p90": 0.02853099999811093, "mean": 0.032097199982672464, "iqr": 0.0004900000476482091, "raw_times": [0.04760199999509496, 0.028271000019230996, 0.02853099999811093, 0.02804099995046272, 0.02804099995046272], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031132000003708526, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02513000004000787, "p50": 0.027131000024382956, "p90": 0.027909999971598154, "mean": 0.027204600019103964, "iqr": 0.0014589999182135216, "raw_times": [0.02513000004000787, 0.027131000024382956, 0.027909999971598154, 0.029401000006146205, 0.026451000053384632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030690999892613036, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02570000003743189, "p50": 0.026741000056063058, "p90": 0.02731099993980024, "mean": 0.02703079999264446, "iqr": 0.0012099999366910197, "raw_times": [0.02570000003743189, 0.02731099993980024, 0.029300999926817894, 0.02610100000310922, 0.026741000056063058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030331000061778468, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025050999965969822, "p50": 0.026220999984616356, "p90": 0.028031000056216726, "mean": 0.026778999995258346, "iqr": 0.0018400000953988638, "raw_times": [0.025050999965969822, 0.026190999960817862, 0.026220999984616356, 0.028031000056216726, 0.028401000008670962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031100999990485434, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02494000000297092, "p50": 0.026971000011144497, "p90": 0.02789099994515709, "mean": 0.027030599972022173, "iqr": 0.0009699999736767495, "raw_times": [0.02494000000297092, 0.026971000011144497, 0.02789099994515709, 0.02842999992935802, 0.02692099997148034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029161000043131935, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024340999971173005, "p50": 0.02594099998987076, "p90": 0.027440999929240206, "mean": 0.026286999968760938, "iqr": 0.0016499999446750735, "raw_times": [0.024340999971173005, 0.027920999968955584, 0.027440999929240206, 0.02594099998987076, 0.025790999984565133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02797100000861974, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025551000021550863, "p50": 0.026880999939749017, "p90": 0.028271000019230996, "mean": 0.027656800011754967, "iqr": 0.002240999947389355, "raw_times": [0.025551000021550863, 0.026880999939749017, 0.02603000007184164, 0.03155100000640232, 0.028271000019230996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02960100005111599, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02251099999739381, "p50": 0.02324100000805629, "p90": 0.023539999972399528, "mean": 0.023146399996676337, "iqr": 0.0007499999696847226, "raw_times": [0.023539999972399528, 0.022790000002714805, 0.02324100000805629, 0.02365000000281725, 0.02251099999739381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029810000000907166, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027370999987397227, "p50": 0.028240999995432503, "p90": 0.028329999963716546, "mean": 0.02825879998908931, "iqr": 0.00023899997358967084, "raw_times": [0.028090999990126875, 0.028240999995432503, 0.028329999963716546, 0.029261000008773408, 0.027370999987397227], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03212000001440174, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02641099996480989, "p50": 0.027520999992702855, "p90": 0.028440999983558868, "mean": 0.027734599996165343, "iqr": 0.001440999938040477, "raw_times": [0.02641099996480989, 0.028440999983558868, 0.029299999994236714, 0.027520999992702855, 0.02700000004551839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.032080999972095015, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026321000007101247, "p50": 0.02724099999795726, "p90": 0.028659999998126295, "mean": 0.02923079999845868, "iqr": 0.0014990000067882647, "raw_times": [0.026321000007101247, 0.03677099999777056, 0.028659999998126295, 0.02716099999133803, 0.02724099999795726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031121000006351096, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025979999975334067, "p50": 0.028520999990178098, "p90": 0.028720999978304462, "mean": 0.027810800008865044, "iqr": 0.00169999992749581, "raw_times": [0.025979999975334067, 0.028811000049699942, 0.028520999990178098, 0.027021000050808652, 0.028720999978304462], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02976100000751103, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026359999992564553, "p50": 0.027051000017763727, "p90": 0.027101000000584463, "mean": 0.027004599996871548, "iqr": 0.00035100003970001126, "raw_times": [0.027101000000584463, 0.027051000017763727, 0.027761000012560544, 0.026749999960884452, 0.026359999992564553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029620999953294813, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02576100001761006, "p50": 0.027530000011211087, "p90": 0.02828099997032041, "mean": 0.0273743999969156, "iqr": 0.001340999972399004, "raw_times": [0.02576100001761006, 0.02828099997032041, 0.026939999997921404, 0.02835999998751504, 0.027530000011211087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030121000008875853, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025459999960730784, "p50": 0.028590999988864496, "p90": 0.02870100001928222, "mean": 0.027812799999082927, "iqr": 0.00113999999484804, "raw_times": [0.025459999960730784, 0.02870100001928222, 0.028751000002102955, 0.02756100002443418, 0.028590999988864496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:26Z", "run": "cb02b80f11f04022890ac7026c401c57", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02632999996876606, "p50": 0.027500999976837193, "p90": 0.028640000039104052, "mean": 0.028318399995441723, "iqr": 0.0021100000253682083, "raw_times": [0.02632999996876606, 0.03259099997876547, 0.027500999976837193, 0.026530000013735844, 0.028640000039104052], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029991000019435887, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html
index 0333e61899bfbfb799696bb358236ac894538ab4..0ee10cb621cd4a8fa09e449aade63a5a1449d022 100644
--- a/activation/impls/hf_kernels_swiglu.html
+++ b/activation/impls/hf_kernels_swiglu.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.21s
+Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:07:54 2025       
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:44 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P0             80W /  350W |       0MiB /  46068MiB |      1%      Default |
+| N/A   32C    P0            133W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 4.26s
+Cell: benchmark | 4.19s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3976,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.944us      1745.67%      70.944us      70.944us             1  
-                                      hf_kernels_swiglu        10.31%     179.916us        99.57%       1.738ms       1.738ms       0.000us         0.00%       5.472us       5.472us             1  
-                      _activation_beeaae6::silu_and_mul         1.09%      18.951us        86.60%       1.512ms     503.911us       4.064us       100.00%       5.472us       1.824us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        83.12%       1.451ms        83.12%       1.451ms       1.451ms       1.408us        34.65%       1.408us       1.408us             1  
-                                            aten::empty         2.66%      46.432us         2.66%      46.432us      15.477us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.39%      41.801us         2.39%      41.801us      13.934us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.43%       7.500us         0.43%       7.500us       7.500us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      72.288us      1807.20%      72.288us      72.288us             1  
+                                      hf_kernels_swiglu        12.07%     211.387us        99.59%       1.744ms       1.744ms       0.000us         0.00%       5.376us       5.376us             1  
+                      _activation_beeaae6::silu_and_mul         1.10%      19.319us        84.87%       1.486ms     495.368us       4.000us       100.00%       5.376us       1.792us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.000us       100.00%       4.000us       1.333us             3  
+                                Activity Buffer Request        81.49%       1.427ms        81.49%       1.427ms       1.427ms       1.376us        34.40%       1.376us       1.376us             1  
+                                            aten::empty         2.64%      46.231us         2.64%      46.231us      15.410us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.28%      39.911us         2.28%      39.911us      13.304us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.41%       7.220us         0.41%       7.220us       7.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.746ms
-Self CUDA time total: 4.064us
+Self CPU time total: 1.751ms
+Self CUDA time total: 4.000us
 
 
 
@@ -3996,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      68.703us      1759.36%      68.703us      68.703us             1  
-                                      hf_kernels_swiglu         6.60%     109.215us        99.70%       1.650ms       1.650ms       0.000us         0.00%       5.217us       5.217us             1  
-                      _activation_beeaae6::silu_and_mul         1.44%      23.760us        91.91%       1.521ms     506.927us       3.905us       100.00%       5.217us       1.739us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.905us       100.00%       3.905us       1.302us             3  
-                                Activity Buffer Request        88.83%       1.470ms        88.83%       1.470ms       1.470ms       1.312us        33.60%       1.312us       1.312us             1  
-                                            aten::empty         1.19%      19.640us         1.19%      19.640us       6.547us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.65%      27.251us         1.65%      27.251us       9.084us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       4.941us         0.30%       4.941us       4.941us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.686us      1579.79%      62.686us      62.686us             1  
+                                      hf_kernels_swiglu         6.72%     108.943us        99.67%       1.616ms       1.616ms       0.000us         0.00%       5.312us       5.312us             1  
+                      _activation_beeaae6::silu_and_mul         1.34%      21.721us        91.77%       1.488ms     495.875us       3.968us       100.00%       5.312us       1.771us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
+                                Activity Buffer Request        88.82%       1.440ms        88.82%       1.440ms       1.440ms       1.344us        33.87%       1.344us       1.344us             1  
+                                            aten::empty         1.18%      19.150us         1.18%      19.150us       6.383us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.61%      26.150us         1.61%      26.150us       8.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.310us         0.33%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.655ms
-Self CUDA time total: 3.905us
+Self CPU time total: 1.621ms
+Self CUDA time total: 3.968us
 
 
 
@@ -4016,16 +4016,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.999us      1388.58%      67.999us      67.999us             1  
-                                      hf_kernels_swiglu         6.71%     113.524us        99.73%       1.687ms       1.687ms       0.000us         0.00%       6.529us       6.529us             1  
-                      _activation_beeaae6::silu_and_mul         1.26%      21.380us        91.91%       1.555ms     518.231us       4.897us       100.00%       6.529us       2.176us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.687us      1361.79%      66.687us      66.687us             1  
+                                      hf_kernels_swiglu         6.74%     109.943us        99.70%       1.626ms       1.626ms       0.000us         0.00%       6.529us       6.529us             1  
+                      _activation_beeaae6::silu_and_mul         1.25%      20.459us        91.78%       1.496ms     498.816us       4.897us       100.00%       6.529us       2.176us             3  
 void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.897us       100.00%       4.897us       1.632us             3  
-                                Activity Buffer Request        89.08%       1.507ms        89.08%       1.507ms       1.507ms       1.632us        33.33%       1.632us       1.632us             1  
-                                            aten::empty         1.11%      18.802us         1.11%      18.802us       6.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.56%      26.371us         1.56%      26.371us       8.790us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.27%       4.571us         0.27%       4.571us       4.571us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        88.91%       1.450ms        88.91%       1.450ms       1.450ms       1.632us        33.33%       1.632us       1.632us             1  
+                                            aten::empty         1.18%      19.260us         1.18%      19.260us       6.420us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.61%      26.232us         1.61%      26.232us       8.744us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       4.870us         0.30%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.692ms
+Self CPU time total: 1.631ms
 Self CUDA time total: 4.897us
 
 
@@ -4036,16 +4036,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.776us      1498.50%      63.776us      63.776us             1  
-                                      hf_kernels_swiglu         5.54%      99.283us        99.75%       1.788ms       1.788ms       0.000us         0.00%       5.696us       5.696us             1  
-                      _activation_beeaae6::silu_and_mul         1.20%      21.550us        93.21%       1.671ms     556.862us       4.256us       100.00%       5.696us       1.899us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      66.081us      1552.66%      66.081us      66.081us             1  
+                                      hf_kernels_swiglu         6.15%     108.423us        99.71%       1.758ms       1.758ms       0.000us         0.00%       5.696us       5.696us             1  
+                      _activation_beeaae6::silu_and_mul         1.25%      22.001us        92.49%       1.631ms     543.697us       4.256us       100.00%       5.696us       1.899us             3  
 void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.256us       100.00%       4.256us       1.419us             3  
-                                Activity Buffer Request        79.15%       1.419ms        79.15%       1.419ms       1.419ms       1.440us        33.83%       1.440us       1.440us             1  
-                                            aten::empty         1.00%      17.972us         1.00%      17.972us       5.991us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        12.85%     230.398us        12.85%     230.398us      76.799us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.25%       4.510us         0.25%       4.510us       4.510us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        80.93%       1.427ms        80.93%       1.427ms       1.427ms       1.440us        33.83%       1.440us       1.440us             1  
+                                            aten::empty         1.07%      18.910us         1.07%      18.910us       6.303us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.31%     181.874us        10.31%     181.874us      60.625us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.29%       5.110us         0.29%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.792ms
+Self CPU time total: 1.764ms
 Self CUDA time total: 4.256us
 
 
@@ -4056,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.431us      1060.31%      62.431us      62.431us             1  
-                                      hf_kernels_swiglu        20.17%      83.914us        98.89%     411.305us     411.305us       0.000us         0.00%       7.872us       7.872us             1  
-                      _activation_beeaae6::silu_and_mul         5.09%      21.171us        74.40%     309.470us     103.157us       5.888us       100.00%       7.872us       2.624us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us       100.00%       5.888us       1.963us             3  
-                                Activity Buffer Request        32.60%     135.614us        32.60%     135.614us     135.614us       1.984us        33.70%       1.984us       1.984us             1  
-                                            aten::empty         4.31%      17.921us         4.31%      17.921us       5.974us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        36.71%     152.685us        36.71%     152.685us      50.895us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.11%       4.631us         1.11%       4.631us       4.631us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.167us      1072.63%      63.167us      63.167us             1  
+                                      hf_kernels_swiglu        15.22%      87.332us        99.19%     569.294us     569.294us       0.000us         0.00%       7.873us       7.873us             1  
+                      _activation_beeaae6::silu_and_mul         3.58%      20.570us        80.67%     463.002us     154.334us       5.889us       100.00%       7.873us       2.624us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.889us       100.00%       5.889us       1.963us             3  
+                                Activity Buffer Request        48.76%     279.877us        48.76%     279.877us     279.877us       1.984us        33.69%       1.984us       1.984us             1  
+                                            aten::empty         3.30%      18.960us         3.30%      18.960us       6.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        28.32%     162.555us        28.32%     162.555us      54.185us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.81%       4.660us         0.81%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 415.936us
-Self CUDA time total: 5.888us
+Self CPU time total: 573.954us
+Self CUDA time total: 5.889us
 
 
 
@@ -4076,16 +4076,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.615us       880.40%      67.615us      67.615us             1  
-                                      hf_kernels_swiglu         5.97%     103.444us        99.74%       1.727ms       1.727ms       0.000us         0.00%      10.240us      10.240us             1  
-                      _activation_beeaae6::silu_and_mul         1.23%      21.310us        92.70%       1.605ms     535.135us       7.680us       100.00%      10.240us       3.413us             3  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.632us       906.67%      69.632us      69.632us             1  
+                                      hf_kernels_swiglu         6.07%     107.484us        99.73%       1.766ms       1.766ms       0.000us         0.00%      10.240us      10.240us             1  
+                      _activation_beeaae6::silu_and_mul         1.19%      21.010us        92.55%       1.639ms     546.413us       7.680us       100.00%      10.240us       3.413us             3  
 void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.680us       100.00%       7.680us       2.560us             3  
-                                Activity Buffer Request        82.79%       1.434ms        82.79%       1.434ms       1.434ms       2.560us        33.33%       2.560us       2.560us             1  
-                                            aten::empty         1.07%      18.611us         1.07%      18.611us       6.204us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.68%     150.305us         8.68%     150.305us      50.102us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.450us         0.26%       4.450us       4.450us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        81.69%       1.447ms        81.69%       1.447ms       1.447ms       2.560us        33.33%       2.560us       2.560us             1  
+                                            aten::empty         1.11%      19.720us         1.11%      19.720us       6.573us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.67%     171.234us         9.67%     171.234us      57.078us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.800us         0.27%       4.800us       4.800us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.732ms
+Self CPU time total: 1.771ms
 Self CUDA time total: 7.680us
 
 
@@ -4096,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.423us       962.12%      63.423us      63.423us             1  
-                                      hf_kernels_swiglu         5.71%      97.705us        99.74%       1.706ms       1.706ms       0.000us         0.00%       8.800us       8.800us             1  
-                      _activation_beeaae6::silu_and_mul         1.25%      21.440us        92.96%       1.590ms     530.071us       6.592us       100.00%       8.800us       2.933us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us       100.00%       6.592us       2.197us             3  
-                                Activity Buffer Request        82.94%       1.419ms        82.94%       1.419ms       1.419ms       2.208us        33.50%       2.208us       2.208us             1  
-                                            aten::empty         1.07%      18.230us         1.07%      18.230us       6.077us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.77%     149.945us         8.77%     149.945us      49.982us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.450us         0.26%       4.450us       4.450us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      72.064us      1098.54%      72.064us      72.064us             1  
+                                      hf_kernels_swiglu         6.19%     109.521us        99.72%       1.763ms       1.763ms       0.000us         0.00%       8.768us       8.768us             1  
+                      _activation_beeaae6::silu_and_mul         1.22%      21.580us        92.43%       1.635ms     544.850us       6.560us       100.00%       8.768us       2.923us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us       100.00%       6.560us       2.187us             3  
+                                Activity Buffer Request        81.92%       1.449ms        81.92%       1.449ms       1.449ms       2.208us        33.66%       2.208us       2.208us             1  
+                                            aten::empty         1.09%      19.351us         1.09%      19.351us       6.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.29%     164.205us         9.29%     164.205us      54.735us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       4.990us         0.28%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.711ms
-Self CUDA time total: 6.592us
+Self CPU time total: 1.768ms
+Self CUDA time total: 6.560us
 
 
 
@@ -4116,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      61.982us       658.89%      61.982us      61.982us             1  
-                                      hf_kernels_swiglu        22.04%      82.603us        98.77%     370.213us     370.213us       0.000us         0.00%      12.543us      12.543us             1  
-                      _activation_beeaae6::silu_and_mul         5.90%      22.112us        71.72%     268.830us      89.610us       9.407us       100.00%      12.543us       4.181us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.407us       100.00%       9.407us       3.136us             3  
-                                Activity Buffer Request        26.16%      98.063us        26.16%      98.063us      98.063us       3.136us        33.34%       3.136us       3.136us             1  
-                                            aten::empty         5.01%      18.780us         5.01%      18.780us       6.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        39.66%     148.655us        39.66%     148.655us      49.552us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.23%       4.600us         1.23%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.118us       692.16%      65.118us      65.118us             1  
+                                      hf_kernels_swiglu        16.62%      89.683us        99.03%     534.374us     534.374us       0.000us         0.00%      12.576us      12.576us             1  
+                      _activation_beeaae6::silu_and_mul         3.96%      21.372us        78.99%     426.201us     142.067us       9.408us       100.00%      12.576us       4.192us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.408us       100.00%       9.408us       3.136us             3  
+                                Activity Buffer Request        44.61%     240.735us        44.61%     240.735us     240.735us       3.168us        33.67%       3.168us       3.168us             1  
+                                            aten::empty         3.43%      18.490us         3.43%      18.490us       6.163us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.41%     164.094us        30.41%     164.094us      54.698us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.97%       5.210us         0.97%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 374.813us
-Self CUDA time total: 9.407us
+Self CPU time total: 539.584us
+Self CUDA time total: 9.408us
 
 
 
@@ -4136,17 +4136,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.776us       490.85%      63.776us      63.776us             1  
-                                      hf_kernels_swiglu        24.11%      99.284us        98.97%     407.515us     407.515us       0.000us         0.00%      17.346us      17.346us             1  
-                      _activation_beeaae6::silu_and_mul         5.19%      21.351us        70.31%     289.510us      96.503us      12.993us       100.00%      17.346us       5.782us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      12.993us       100.00%      12.993us       4.331us             3  
-                                Activity Buffer Request        28.96%     119.264us        28.96%     119.264us     119.264us       4.353us        33.50%       4.353us       4.353us             1  
-                                            aten::empty         4.55%      18.721us         4.55%      18.721us       6.240us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        36.16%     148.895us        36.16%     148.895us      49.632us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.03%       4.240us         1.03%       4.240us       4.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.182us       527.34%      69.182us      69.182us             1  
+                                      hf_kernels_swiglu        12.86%     103.214us        99.41%     797.800us     797.800us       0.000us         0.00%      17.534us      17.534us             1  
+                      _activation_beeaae6::silu_and_mul         2.63%      21.139us        84.20%     675.726us     225.242us      13.119us       100.00%      17.534us       5.845us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.119us       100.00%      13.119us       4.373us             3  
+                                Activity Buffer Request        61.21%     491.232us        61.21%     491.232us     491.232us       4.415us        33.65%       4.415us       4.415us             1  
+                                            aten::empty         2.35%      18.860us         2.35%      18.860us       6.287us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        20.35%     163.355us        20.35%     163.355us      54.452us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.59%       4.750us         0.59%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 411.755us
-Self CUDA time total: 12.993us
+Self CPU time total: 802.550us
+Self CUDA time total: 13.119us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4163,13 +4163,12 @@ hf_kernels_swiglu        cuda_T512_D768         0.03  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 15 packages in 14ms
+Installed 15 packages in 13ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 7 files:   0%|          | 0/7 [00:00&lt;?, ?it/s]
-Fetching 7 files:  14%|█▍        | 1/7 [00:00&lt;00:00,  7.79it/s]
-Fetching 7 files:  71%|███████▏  | 5/7 [00:00&lt;00:00, 11.48it/s]
-Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 15.62it/s]</div>
+Fetching 7 files:  71%|███████▏  | 5/7 [00:00&lt;00:00, 14.29it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 19.98it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html
index e0544d7b368c13222c83ebad4ecbb275fed41e18..6e53efa4229f749d46be9ca846a20dfeed1ecd5d 100644
--- a/activation/impls/torch_swiglu.html
+++ b/activation/impls/torch_swiglu.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.21s
+Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:07:54 2025       
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:44 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P0             80W /  350W |       0MiB /  46068MiB |      1%      Default |
+| N/A   32C    P0            133W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 6.88s
+Cell: benchmark | 6.86s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     206.526us      1621.34%     206.526us     206.526us             1  
-                                            torch_eager        11.16%     213.167us        99.55%       1.902ms       1.902ms       0.000us         0.00%      15.042us      15.042us             1  
-                                             aten::silu         3.29%      62.892us        81.79%       1.563ms     520.961us       6.529us        51.26%       8.833us       2.944us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.529us        51.26%       6.529us       2.176us             3  
-                                              aten::mul         2.06%      39.382us         3.23%      61.724us      20.575us       6.209us        48.74%       6.209us       2.070us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.209us        48.74%       6.209us       2.070us             3  
-                                Activity Buffer Request        76.05%       1.453ms        76.05%       1.453ms       1.453ms       2.304us        18.09%       2.304us       2.304us             1  
-                                            aten::slice         2.72%      51.931us         3.38%      64.581us      10.764us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.66%      12.650us         0.66%      12.650us       2.108us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.62%      69.144us         3.62%      69.144us      11.524us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.45%       8.521us         0.45%       8.521us       8.521us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     189.470us      1483.94%     189.470us     189.470us             1  
+                                            torch_eager        11.64%     220.727us        99.60%       1.889ms       1.889ms       0.000us         0.00%      15.103us      15.103us             1  
+                                             aten::silu         3.36%      63.732us        81.84%       1.552ms     517.326us       6.559us        51.37%       8.894us       2.965us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.559us        51.37%       6.559us       2.186us             3  
+                                              aten::mul         1.83%      34.608us         3.05%      57.780us      19.260us       6.209us        48.63%       6.209us       2.070us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.209us        48.63%       6.209us       2.070us             3  
+                                Activity Buffer Request        76.17%       1.444ms        76.17%       1.444ms       1.444ms       2.335us        18.29%       2.335us       2.335us             1  
+                                            aten::slice         2.47%      46.790us         3.07%      58.281us       9.714us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.61%      11.491us         0.61%      11.491us       1.915us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.54%      67.043us         3.54%      67.043us      11.174us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.40%       7.531us         0.40%       7.531us       7.531us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.911ms
-Self CUDA time total: 12.738us
+Self CPU time total: 1.896ms
+Self CUDA time total: 12.768us
 
 
 
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.965us      1227.21%     151.965us     151.965us             1  
-                                            torch_eager         7.02%     119.974us        99.63%       1.704ms       1.704ms       0.000us         0.00%      14.558us      14.558us             1  
-                                             aten::silu         2.35%      40.140us        88.12%       1.507ms     502.320us       6.399us        51.68%       8.574us       2.858us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.895us      1299.43%     160.895us     160.895us             1  
+                                            torch_eager         6.82%     117.243us        99.71%       1.713ms       1.713ms       0.000us         0.00%      14.558us      14.558us             1  
+                                             aten::silu         2.46%      42.340us        88.23%       1.516ms     505.362us       6.399us        51.68%       8.575us       2.858us             3  
 void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.399us        51.68%       6.399us       2.133us             3  
-                                              aten::mul         1.61%      27.481us         2.72%      46.541us      15.514us       5.984us        48.32%       5.984us       1.995us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        48.32%       5.984us       1.995us             3  
-                                Activity Buffer Request        84.14%       1.439ms        84.14%       1.439ms       1.439ms       2.175us        17.56%       2.175us       2.175us             1  
-                                            aten::slice         1.43%      24.471us         1.78%      30.412us       5.069us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.35%       5.941us         0.35%       5.941us       0.990us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.74%      46.851us         2.74%      46.851us       7.809us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.37%       6.320us         0.37%       6.320us       6.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                              aten::mul         1.64%      28.101us         2.83%      48.681us      16.227us       5.983us        48.32%       5.983us       1.994us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.983us        48.32%       5.983us       1.994us             3  
+                                Activity Buffer Request        84.10%       1.445ms        84.10%       1.445ms       1.445ms       2.176us        17.57%       2.176us       2.176us             1  
+                                            aten::slice         1.47%      25.252us         1.82%      31.222us       5.204us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.35%       5.970us         0.35%       5.970us       0.995us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.87%      49.290us         2.87%      49.290us       8.215us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.020us         0.29%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.710ms
-Self CUDA time total: 12.383us
+Self CPU time total: 1.718ms
+Self CUDA time total: 12.382us
 
 
 
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.008us      1139.77%     151.008us     151.008us             1  
-                                            torch_eager         6.34%     107.173us        99.70%       1.687ms       1.687ms       0.000us         0.00%      15.522us      15.522us             1  
-                                             aten::silu         2.38%      40.332us        88.83%       1.503ms     500.911us       6.817us        51.45%       9.090us       3.030us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.817us        51.45%       6.817us       2.272us             3  
-                                              aten::mul         1.57%      26.503us         2.73%      46.253us      15.418us       6.432us        48.55%       6.432us       2.144us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.432us        48.55%       6.432us       2.144us             3  
-                                Activity Buffer Request        84.91%       1.436ms        84.91%       1.436ms       1.436ms       2.273us        17.16%       2.273us       2.273us             1  
-                                            aten::slice         1.43%      24.250us         1.81%      30.550us       5.092us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.37%       6.300us         0.37%       6.300us       1.050us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.70%      45.731us         2.70%      45.731us       7.622us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.30%       5.000us         0.30%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.982us      1195.38%     157.982us     157.982us             1  
+                                            torch_eager         6.51%     110.244us        99.65%       1.686ms       1.686ms       0.000us         0.00%      15.488us      15.488us             1  
+                                             aten::silu         2.52%      42.653us        88.50%       1.498ms     499.192us       6.784us        51.33%       9.056us       3.019us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.33%       6.784us       2.261us             3  
+                                              aten::mul         1.66%      28.021us         2.76%      46.791us      15.597us       6.432us        48.67%       6.432us       2.144us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.432us        48.67%       6.432us       2.144us             3  
+                                Activity Buffer Request        84.30%       1.427ms        84.30%       1.427ms       1.427ms       2.272us        17.19%       2.272us       2.272us             1  
+                                            aten::slice         1.51%      25.627us         1.87%      31.700us       5.283us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.073us         0.36%       6.073us       1.012us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.78%      47.050us         2.78%      47.050us       7.842us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       5.950us         0.35%       5.950us       5.950us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
 Self CPU time total: 1.692ms
-Self CUDA time total: 13.249us
+Self CUDA time total: 13.216us
 
 
 
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     153.149us      1202.68%     153.149us     153.149us             1  
-                                            torch_eager         6.34%     109.104us        99.71%       1.717ms       1.717ms       0.000us         0.00%      14.941us      14.941us             1  
-                                             aten::silu         2.38%      40.982us        88.93%       1.531ms     510.411us       6.558us        51.50%       8.765us       2.922us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.558us        51.50%       6.558us       2.186us             3  
-                                              aten::mul         1.52%      26.241us         2.68%      46.222us      15.407us       6.176us        48.50%       6.176us       2.059us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        48.50%       6.176us       2.059us             3  
-                                Activity Buffer Request        73.41%       1.264ms        73.41%       1.264ms       1.264ms       2.207us        17.33%       2.207us       2.207us             1  
-                                            aten::slice         1.43%      24.560us         1.77%      30.400us       5.067us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.34%       5.840us         0.34%       5.840us       0.973us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        14.29%     246.139us        14.29%     246.139us      41.023us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.29%       4.920us         0.29%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     159.902us      1258.67%     159.902us     159.902us             1  
+                                            torch_eager         6.73%     114.317us        99.66%       1.694ms       1.694ms       0.000us         0.00%      14.912us      14.912us             1  
+                                             aten::silu         2.46%      41.881us        88.34%       1.501ms     500.465us       6.560us        51.64%       8.768us       2.923us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.64%       6.560us       2.187us             3  
+                                              aten::mul         1.68%      28.581us         2.79%      47.441us      15.814us       6.144us        48.36%       6.144us       2.048us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.144us        48.36%       6.144us       2.048us             3  
+                                Activity Buffer Request        74.33%       1.263ms        74.33%       1.263ms       1.263ms       2.208us        17.38%       2.208us       2.208us             1  
+                                            aten::slice         1.44%      24.468us         1.80%      30.638us       5.106us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.170us         0.36%       6.170us       1.028us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.65%     214.994us        12.65%     214.994us      35.832us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.34%       5.830us         0.34%       5.830us       5.830us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.722ms
-Self CUDA time total: 12.734us
+Self CPU time total: 1.700ms
+Self CUDA time total: 12.704us
 
 
 
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     149.310us      1126.87%     149.310us     149.310us             1  
-                                            torch_eager         5.88%     107.113us        99.73%       1.817ms       1.817ms       0.000us         0.00%      15.555us      15.555us             1  
-                                             aten::silu         2.34%      42.602us        89.83%       1.636ms     545.432us       6.785us        51.21%       9.090us       3.030us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us        51.21%       6.785us       2.262us             3  
-                                              aten::mul         1.33%      24.312us         2.33%      42.512us      14.171us       6.465us        48.79%       6.465us       2.155us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.465us        48.79%       6.465us       2.155us             3  
-                                Activity Buffer Request        78.20%       1.424ms        78.20%       1.424ms       1.424ms       2.305us        17.40%       2.305us       2.305us             1  
-                                            aten::slice         1.35%      24.650us         1.68%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.33%       6.010us         0.33%       6.010us       1.002us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.29%     187.406us        10.29%     187.406us      31.234us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       4.950us         0.27%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     157.053us      1185.48%     157.053us     157.053us             1  
+                                            torch_eager         6.08%     111.294us        99.69%       1.824ms       1.824ms       0.000us         0.00%      15.552us      15.552us             1  
+                                             aten::silu         2.39%      43.729us        89.42%       1.636ms     545.306us       6.784us        51.21%       9.088us       3.029us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.21%       6.784us       2.261us             3  
+                                              aten::mul         1.44%      26.361us         2.52%      46.181us      15.394us       6.464us        48.79%       6.464us       2.155us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.464us        48.79%       6.464us       2.155us             3  
+                                Activity Buffer Request        77.97%       1.426ms        77.97%       1.426ms       1.426ms       2.304us        17.39%       2.304us       2.304us             1  
+                                            aten::slice         1.34%      24.571us         1.66%      30.441us       5.074us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.32%       5.870us         0.32%       5.870us       0.978us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.14%     185.544us        10.14%     185.544us      30.924us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.601us         0.31%       5.601us       5.601us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.822ms
-Self CUDA time total: 13.250us
+Self CPU time total: 1.829ms
+Self CUDA time total: 13.248us
 
 
 
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     143.804us       924.73%     143.804us     143.804us             1  
-                                            torch_eager        21.50%     103.524us        99.01%     476.736us     476.736us       0.000us         0.00%      18.271us      18.271us             1  
-                                             aten::silu         8.70%      41.893us        62.70%     301.891us     100.630us       7.999us        51.44%      10.719us       3.573us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.999us        51.44%       7.999us       2.666us             3  
-                                              aten::mul         5.07%      24.390us         8.83%      42.521us      14.174us       7.552us        48.56%       7.552us       2.517us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.56%       7.552us       2.517us             3  
-                                Activity Buffer Request        22.22%     106.973us        22.22%     106.973us     106.973us       2.720us        17.49%       2.720us       2.720us             1  
-                                            aten::slice         4.80%      23.090us         5.98%      28.800us       4.800us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.19%       5.710us         1.19%       5.710us       0.952us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        35.55%     171.156us        35.55%     171.156us      28.526us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.99%       4.760us         0.99%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.390us       977.47%     151.390us     151.390us             1  
+                                            torch_eager        22.03%     109.975us        99.02%     494.363us     494.363us       0.000us         0.00%      18.176us      18.176us             1  
+                                             aten::silu         8.41%      41.971us        61.88%     308.937us     102.979us       7.936us        51.24%      10.624us       3.541us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.24%       7.936us       2.645us             3  
+                                              aten::mul         5.23%      26.101us         8.92%      44.531us      14.844us       7.552us        48.76%       7.552us       2.517us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.76%       7.552us       2.517us             3  
+                                Activity Buffer Request        22.19%     110.773us        22.19%     110.773us     110.773us       2.688us        17.36%       2.688us       2.688us             1  
+                                            aten::slice         5.05%      25.220us         6.19%      30.920us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.14%       5.700us         1.14%       5.700us       0.950us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        34.98%     174.623us        34.98%     174.623us      29.104us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.98%       4.900us         0.98%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 481.496us
-Self CUDA time total: 15.551us
+Self CPU time total: 499.263us
+Self CUDA time total: 15.488us
 
 
 
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     153.372us      1067.46%     153.372us     153.372us             1  
-                                            torch_eager         5.96%     108.164us        99.73%       1.810ms       1.810ms       0.000us         0.00%      16.832us      16.832us             1  
-                                             aten::silu         2.30%      41.731us        89.59%       1.626ms     541.925us       7.360us        51.22%       9.824us       3.275us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        51.22%       7.360us       2.453us             3  
-                                              aten::mul         1.41%      25.542us         2.47%      44.792us      14.931us       7.008us        48.78%       7.008us       2.336us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.008us        48.78%       7.008us       2.336us             3  
-                                Activity Buffer Request        78.82%       1.430ms        78.82%       1.430ms       1.430ms       2.464us        17.15%       2.464us       2.464us             1  
-                                            aten::slice         1.37%      24.840us         1.70%      30.900us       5.150us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.33%       6.060us         0.33%       6.060us       1.010us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.53%     172.976us         9.53%     172.976us      28.829us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       4.960us         0.27%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     163.583us      1143.70%     163.583us     163.583us             1  
+                                            torch_eager         6.28%     116.052us        99.70%       1.841ms       1.841ms       0.000us         0.00%      16.767us      16.767us             1  
+                                             aten::silu         2.27%      41.942us        89.09%       1.645ms     548.450us       7.327us        51.23%       9.791us       3.264us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.327us        51.23%       7.327us       2.442us             3  
+                                              aten::mul         1.55%      28.681us         2.62%      48.392us      16.131us       6.976us        48.77%       6.976us       2.325us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.976us        48.77%       6.976us       2.325us             3  
+                                Activity Buffer Request        78.22%       1.445ms        78.22%       1.445ms       1.445ms       2.464us        17.23%       2.464us       2.464us             1  
+                                            aten::slice         1.38%      25.430us         1.70%      31.392us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.32%       5.962us         0.32%       5.962us       0.994us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.67%     178.614us         9.67%     178.614us      29.769us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.30%       5.570us         0.30%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.815ms
-Self CUDA time total: 14.368us
+Self CPU time total: 1.847ms
+Self CUDA time total: 14.303us
 
 
 
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     146.240us       942.27%     146.240us     146.240us             1  
-                                            torch_eager        22.59%     104.486us        98.96%     457.726us     457.726us       0.000us         0.00%      18.208us      18.208us             1  
-                                             aten::silu         8.78%      40.590us        60.43%     279.519us      93.173us       7.936us        51.13%      10.624us       3.541us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.13%       7.936us       2.645us             3  
-                                              aten::mul         5.53%      25.579us         9.45%      43.730us      14.577us       7.584us        48.87%       7.584us       2.528us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.584us        48.87%       7.584us       2.528us             3  
-                                Activity Buffer Request        18.85%      87.193us        18.85%      87.193us      87.193us       2.688us        17.32%       2.688us       2.688us             1  
-                                            aten::slice         5.23%      24.201us         6.48%      29.991us       4.999us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.25%       5.790us         1.25%       5.790us       0.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        36.73%     169.887us        36.73%     169.887us      28.314us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         1.04%       4.800us         1.04%       4.800us       4.800us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     150.172us       969.60%     150.172us     150.172us             1  
+                                            torch_eager        23.07%     110.204us        98.98%     472.752us     472.752us       0.000us         0.00%      18.176us      18.176us             1  
+                                             aten::silu         9.08%      43.371us        60.20%     287.547us      95.849us       7.936us        51.24%      10.624us       3.541us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.24%       7.936us       2.645us             3  
+                                              aten::mul         5.48%      26.181us         9.38%      44.801us      14.934us       7.552us        48.76%       7.552us       2.517us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.76%       7.552us       2.517us             3  
+                                Activity Buffer Request        19.26%      92.002us        19.26%      92.002us      92.002us       2.688us        17.36%       2.688us       2.688us             1  
+                                            aten::slice         5.00%      23.870us         6.32%      30.200us       5.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.33%       6.330us         1.33%       6.330us       1.055us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        35.76%     170.794us        35.76%     170.794us      28.466us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.02%       4.871us         1.02%       4.871us       4.871us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 462.526us
-Self CUDA time total: 15.520us
+Self CPU time total: 477.623us
+Self CUDA time total: 15.488us
 
 
 
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     181.470us       803.28%     181.470us     181.470us             1  
-                                            torch_eager         5.97%     109.125us        99.74%       1.823ms       1.823ms       0.000us         0.00%      26.526us      26.526us             1  
-                                             aten::silu         2.38%      43.492us        88.50%       1.617ms     539.072us      11.647us        51.56%      15.582us       5.194us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.647us        51.56%      11.647us       3.882us             3  
-                                              aten::mul         1.42%      25.882us         3.51%      64.123us      21.374us      10.944us        48.44%      10.944us       3.648us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.944us        48.44%      10.944us       3.648us             3  
-                                Activity Buffer Request        77.67%       1.419ms        77.67%       1.419ms       1.419ms       3.935us        17.42%       3.935us       3.935us             1  
-                                            aten::slice         1.42%      25.910us         1.76%      32.089us       5.348us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.34%       6.179us         0.34%       6.179us       1.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.54%     192.606us        10.54%     192.606us      32.101us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.790us         0.26%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     160.000us       713.30%     160.000us     160.000us             1  
+                                            torch_eager         5.99%     109.975us        99.73%       1.831ms       1.831ms       0.000us         0.00%      26.335us      26.335us             1  
+                                             aten::silu         2.30%      42.230us        89.52%       1.643ms     547.763us      11.583us        51.64%      15.487us       5.162us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.583us        51.64%      11.583us       3.861us             3  
+                                              aten::mul         1.54%      28.250us         2.52%      46.180us      15.393us      10.848us        48.36%      10.848us       3.616us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.848us        48.36%      10.848us       3.616us             3  
+                                Activity Buffer Request        78.83%       1.447ms        78.83%       1.447ms       1.447ms       3.904us        17.40%       3.904us       3.904us             1  
+                                            aten::slice         1.37%      25.211us         1.70%      31.261us       5.210us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.33%       6.050us         0.33%       6.050us       1.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.37%     171.964us         9.37%     171.964us      28.661us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       4.930us         0.27%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.827ms
-Self CUDA time total: 22.591us
+Self CPU time total: 1.836ms
+Self CUDA time total: 22.431us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4184,7 +4184,7 @@ torch_eager              cuda_T512_D768         0.05  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 192ms
+Installed 37 packages in 230ms
 </div>
 </div>
 <div class="cell-artifacts">
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg
index 2eb6d36da2a386c6f3b7ffe7a4f2ecf07fbe531d..b809b51f58837145ae3fdbcb04aa1aec4a5e023e 100644
--- a/activation/results/artifacts/combine/latency.svg
+++ b/activation/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:431dea6a591fc822f7d0d0d6f793e8c11170edb647c627b5a44ad9883df2c3fc
-size 20697
+oid sha256:f62c7d85fc4a76cf7a1060a62df99ff0d32133ab94bb502b68dcd53171c39602
+size 21424
diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html
index f11a4ea4cf1c2f2bfbc419d5616f99db4990e15c..35064093e9085dbed21e2edd8a0a4e6c497bbb9d 100644
--- a/activation/results/combined_results.html
+++ b/activation/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:13.211569</dc:date>
+    <dc:date>2025-10-29T14:27:49.999657</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4021,83 +4021,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 60.23 416.825206  L 847.294169 416.825206  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 428.188156  L 847.294169 428.188156  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="416.825206" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="428.188156" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="420.624425" transform="rotate(-0 53.23 420.624425)">0.025</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="431.987375" transform="rotate(-0 53.23 431.987375)">0.025</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 60.23 346.161452  L 847.294169 346.161452  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 362.86799  L 847.294169 362.86799  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="346.161452" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="362.86799" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="349.960671" transform="rotate(-0 53.23 349.960671)">0.030</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="366.667209" transform="rotate(-0 53.23 366.667209)">0.030</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 60.23 275.497698  L 847.294169 275.497698  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 297.547824  L 847.294169 297.547824  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="275.497698" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="297.547824" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="279.296917" transform="rotate(-0 53.23 279.296917)">0.035</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.347043" transform="rotate(-0 53.23 301.347043)">0.035</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 60.23 204.833944  L 847.294169 204.833944  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 232.227658  L 847.294169 232.227658  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="204.833944" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="232.227658" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="208.633163" transform="rotate(-0 53.23 208.633163)">0.040</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="236.026877" transform="rotate(-0 53.23 236.026877)">0.040</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 60.23 134.170191  L 847.294169 134.170191  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 166.907492  L 847.294169 166.907492  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_14">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="134.170191" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="166.907492" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_14">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="137.969409" transform="rotate(-0 53.23 137.969409)">0.045</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="170.706711" transform="rotate(-0 53.23 170.706711)">0.045</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 60.23 63.506437  L 847.294169 63.506437  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 101.587327  L 847.294169 101.587327  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_15">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="63.506437" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="101.587327" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_15">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="67.305655" transform="rotate(-0 53.23 67.305655)">0.050</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="105.386545" transform="rotate(-0 53.23 105.386545)">0.050</text>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="grid-y--8" class="grid grid-y">
+      <path d="M 60.23 36.267161  L 847.294169 36.267161  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_16">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_16">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4105,37 +4118,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     </g>
    </g>
    <g id="series--hf-kernels-swiglu" class="series">
-    <path d="M 96.005644 451.16779  L 185.444754 370.031668  L 274.883864 370.596978  L 364.322974 386.708314  L 453.762084 392.220086  L 543.201194 399.569118  L 632.640304 388.969554  L 722.079415 403.526288  L 811.518525 390.241503  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 451.16779  L 185.444754 385.847624  L 274.883864 395.253728  L 364.322974 398.911657  L 453.762084 382.189695  L 543.201194 401.393823  L 632.640304 395.136152  L 722.079415 381.275213  L 811.518525 395.515009  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
      <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="185.444754" y="370.031668" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="274.883864" y="370.596978" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="364.322974" y="386.708314" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="453.762084" y="392.220086" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="543.201194" y="399.569118" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="632.640304" y="388.969554" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="722.079415" y="403.526288" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="811.518525" y="390.241503" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="185.444754" y="385.847624" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="274.883864" y="395.253728" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="364.322974" y="398.911657" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="453.762084" y="382.189695" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="543.201194" y="401.393823" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="632.640304" y="395.136152" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="722.079415" y="381.275213" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="811.518525" y="395.515009" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 96.005644 166.37873  L 185.444754 47.08418  L 274.883864 54.857193  L 364.322974 60.807081  L 453.762084 69.569387  L 543.201194 78.176231  L 632.640304 66.44605  L 722.079415 63.902153  L 811.518525 71.109857  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 194.328898  L 185.444754 47.08418  L 274.883864 59.495011  L 364.322974 61.46768  L 453.762084 66.170732  L 543.201194 84.055394  L 632.640304 56.503348  L 722.079415 80.67181  L 811.518525 81.586292  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
-     <use ns4:href="#m9b8c54d372" x="96.005644" y="166.37873" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="96.005644" y="194.328898" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="274.883864" y="54.857193" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="364.322974" y="60.807081" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="453.762084" y="69.569387" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="543.201194" y="78.176231" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="632.640304" y="66.44605" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="722.079415" y="63.902153" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="811.518525" y="71.109857" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="274.883864" y="59.495011" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="364.322974" y="61.46768" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="453.762084" y="66.170732" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="543.201194" y="84.055394" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="632.640304" y="56.503348" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="722.079415" y="80.67181" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="811.518525" y="81.586292" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
@@ -4150,30 +4163,30 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
    <g id="patch_6">
     <path d="M 60.23 26.88  L 847.294169 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
-   <g id="text_16">
+   <g id="text_17">
     <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
    </g>
    <g id="legend" class="legend">
     <g id="patch_7">
-     <path d="M 720.811356 466.37197  L 840.294169 466.37197  Q 842.294169 466.37197 842.294169 464.37197  L 842.294169 435.45947  Q 842.294169 433.45947 840.294169 433.45947  L 720.811356 433.45947  Q 718.811356 433.45947 718.811356 435.45947  L 718.811356 464.37197  Q 718.811356 466.37197 720.811356 466.37197  L 720.811356 466.37197  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+     <path d="M 720.811356 64.7925  L 840.294169 64.7925  Q 842.294169 64.7925 842.294169 62.7925  L 842.294169 33.88  Q 842.294169 31.88 840.294169 31.88  L 720.811356 31.88  Q 718.811356 31.88 718.811356 33.88  L 718.811356 62.7925  Q 718.811356 64.7925 720.811356 64.7925  L 720.811356 64.7925  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
     </g>
-    <g id="line2d_16">
-     <path d="M 722.811356 441.557908  L 732.811356 441.557908  L 742.811356 441.557908  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_17">
+     <path d="M 722.811356 39.978438  L 732.811356 39.978438  L 742.811356 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
+      <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
      </g>
     </g>
     <g id="legend-label--hf-kernels-swiglu" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
     </g>
-    <g id="line2d_17">
-     <path d="M 722.811356 456.514158  L 732.811356 456.514158  L 742.811356 456.514158  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_18">
+     <path d="M 722.811356 54.934687  L 732.811356 54.934687  L 742.811356 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
+      <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
      </g>
     </g>
     <g id="legend-label--torch-eager" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
     </g>
    </g>
   </g>
@@ -4193,7 +4206,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 4.28s
+Cell: combine | 4.24s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4319,7 +4332,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 195ms
+Installed 37 packages in 218ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4332,7 +4345,7 @@ Installed 37 packages in 195ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:13.211569</dc:date>
+    <dc:date>2025-10-29T14:27:49.999657</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4481,83 +4494,96 @@ Installed 37 packages in 195ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 60.23 416.825206  L 847.294169 416.825206  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 428.188156  L 847.294169 428.188156  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="416.825206" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="428.188156" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="420.624425" transform="rotate(-0 53.23 420.624425)">0.025</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="431.987375" transform="rotate(-0 53.23 431.987375)">0.025</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 60.23 346.161452  L 847.294169 346.161452  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 362.86799  L 847.294169 362.86799  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="346.161452" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="362.86799" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="349.960671" transform="rotate(-0 53.23 349.960671)">0.030</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="366.667209" transform="rotate(-0 53.23 366.667209)">0.030</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 60.23 275.497698  L 847.294169 275.497698  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 297.547824  L 847.294169 297.547824  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="275.497698" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="297.547824" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="279.296917" transform="rotate(-0 53.23 279.296917)">0.035</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.347043" transform="rotate(-0 53.23 301.347043)">0.035</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 60.23 204.833944  L 847.294169 204.833944  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 232.227658  L 847.294169 232.227658  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="204.833944" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="232.227658" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="208.633163" transform="rotate(-0 53.23 208.633163)">0.040</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="236.026877" transform="rotate(-0 53.23 236.026877)">0.040</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 60.23 134.170191  L 847.294169 134.170191  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 166.907492  L 847.294169 166.907492  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_14">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="134.170191" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="166.907492" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_14">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="137.969409" transform="rotate(-0 53.23 137.969409)">0.045</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="170.706711" transform="rotate(-0 53.23 170.706711)">0.045</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 60.23 63.506437  L 847.294169 63.506437  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 101.587327  L 847.294169 101.587327  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_15">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="63.506437" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="101.587327" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_15">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="67.305655" transform="rotate(-0 53.23 67.305655)">0.050</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="105.386545" transform="rotate(-0 53.23 105.386545)">0.050</text>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="grid-y--8" class="grid grid-y">
+      <path d="M 60.23 36.267161  L 847.294169 36.267161  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_16">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="60.23" y="36.267161" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_16">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="40.066379" transform="rotate(-0 53.23 40.066379)">0.055</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4565,37 +4591,37 @@ Installed 37 packages in 195ms
     </g>
    </g>
    <g id="series--hf-kernels-swiglu" class="series">
-    <path d="M 96.005644 451.16779  L 185.444754 370.031668  L 274.883864 370.596978  L 364.322974 386.708314  L 453.762084 392.220086  L 543.201194 399.569118  L 632.640304 388.969554  L 722.079415 403.526288  L 811.518525 390.241503  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 451.16779  L 185.444754 385.847624  L 274.883864 395.253728  L 364.322974 398.911657  L 453.762084 382.189695  L 543.201194 401.393823  L 632.640304 395.136152  L 722.079415 381.275213  L 811.518525 395.515009  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
      <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="185.444754" y="370.031668" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="274.883864" y="370.596978" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="364.322974" y="386.708314" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="453.762084" y="392.220086" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="543.201194" y="399.569118" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="632.640304" y="388.969554" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="722.079415" y="403.526288" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="811.518525" y="390.241503" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="185.444754" y="385.847624" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="274.883864" y="395.253728" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="364.322974" y="398.911657" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="453.762084" y="382.189695" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="543.201194" y="401.393823" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="632.640304" y="395.136152" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="722.079415" y="381.275213" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="811.518525" y="395.515009" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 96.005644 166.37873  L 185.444754 47.08418  L 274.883864 54.857193  L 364.322974 60.807081  L 453.762084 69.569387  L 543.201194 78.176231  L 632.640304 66.44605  L 722.079415 63.902153  L 811.518525 71.109857  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 194.328898  L 185.444754 47.08418  L 274.883864 59.495011  L 364.322974 61.46768  L 453.762084 66.170732  L 543.201194 84.055394  L 632.640304 56.503348  L 722.079415 80.67181  L 811.518525 81.586292  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
-     <use ns4:href="#m9b8c54d372" x="96.005644" y="166.37873" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="96.005644" y="194.328898" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="274.883864" y="54.857193" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="364.322974" y="60.807081" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="453.762084" y="69.569387" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="543.201194" y="78.176231" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="632.640304" y="66.44605" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="722.079415" y="63.902153" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="811.518525" y="71.109857" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="274.883864" y="59.495011" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="364.322974" y="61.46768" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="453.762084" y="66.170732" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="543.201194" y="84.055394" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="632.640304" y="56.503348" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="722.079415" y="80.67181" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="811.518525" y="81.586292" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
@@ -4610,30 +4636,30 @@ Installed 37 packages in 195ms
    <g id="patch_6">
     <path d="M 60.23 26.88  L 847.294169 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
    </g>
-   <g id="text_16">
+   <g id="text_17">
     <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
    </g>
    <g id="legend" class="legend">
     <g id="patch_7">
-     <path d="M 720.811356 466.37197  L 840.294169 466.37197  Q 842.294169 466.37197 842.294169 464.37197  L 842.294169 435.45947  Q 842.294169 433.45947 840.294169 433.45947  L 720.811356 433.45947  Q 718.811356 433.45947 718.811356 435.45947  L 718.811356 464.37197  Q 718.811356 466.37197 720.811356 466.37197  L 720.811356 466.37197  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+     <path d="M 720.811356 64.7925  L 840.294169 64.7925  Q 842.294169 64.7925 842.294169 62.7925  L 842.294169 33.88  Q 842.294169 31.88 840.294169 31.88  L 720.811356 31.88  Q 718.811356 31.88 718.811356 33.88  L 718.811356 62.7925  Q 718.811356 64.7925 720.811356 64.7925  L 720.811356 64.7925  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
     </g>
-    <g id="line2d_16">
-     <path d="M 722.811356 441.557908  L 732.811356 441.557908  L 742.811356 441.557908  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_17">
+     <path d="M 722.811356 39.978438  L 732.811356 39.978438  L 742.811356 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
+      <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
      </g>
     </g>
     <g id="legend-label--hf-kernels-swiglu" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
     </g>
-    <g id="line2d_17">
-     <path d="M 722.811356 456.514158  L 732.811356 456.514158  L 742.811356 456.514158  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <g id="line2d_18">
+     <path d="M 722.811356 54.934687  L 732.811356 54.934687  L 742.811356 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
      <g>
-      <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
+      <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
      </g>
     </g>
     <g id="legend-label--torch-eager" class="legend">
-     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
     </g>
    </g>
   </g>
diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
index 062646d5a3f22298019a79ab8e52f52ea42bd834..3c3e9cb1937f70bc8a6005f64424ae1ae23f373f 100644
--- a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
+++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
@@ -1,24 +1,24 @@
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06712200001857127, "p50": 0.06883200001084333, "p90": 0.06976199995278876, "mean": 0.06901199997173535, "iqr": 0.0014600000213249587, "raw_times": [0.06976199995278876, 0.07104199994500959, 0.06712200001857127, 0.0683019999314638, 0.06883200001084333], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0738530000035098, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08455299996512622, "p50": 0.08599400007369695, "p90": 0.0868530000843748, "mean": 0.08612520005044644, "iqr": 0.0014299999975264654, "raw_times": [0.08780300004218589, 0.08455299996512622, 0.0868530000843748, 0.08542300008684833, 0.08599400007369695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941300006881647, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08494299993344612, "p50": 0.08714299997336639, "p90": 0.08724299993900786, "mean": 0.086546999955317, "iqr": 0.0020200000108161476, "raw_times": [0.08522299992819171, 0.08714299997336639, 0.08818300000257295, 0.08724299993900786, 0.08494299993344612], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105300000555872, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08327299997290538, "p50": 0.084122999965075, "p90": 0.08580299993354856, "mean": 0.08452299998680246, "iqr": 0.0023699999474047218, "raw_times": [0.08327299997290538, 0.084122999965075, 0.08598300007633952, 0.08580299993354856, 0.08343299998614384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08891300001323543, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08298299997022696, "p50": 0.08508299993081891, "p90": 0.08600299997851835, "mean": 0.0849267999683434, "iqr": 0.0016210000239880173, "raw_times": [0.08298299997022696, 0.08508299993081891, 0.08600299997851835, 0.08438199995453033, 0.08618300000762247], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08780300004218589, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08270299997548136, "p50": 0.08315299999139825, "p90": 0.0846430000365217, "mean": 0.08407499999520951, "iqr": 0.0019010000187336118, "raw_times": [0.08315299999139825, 0.08713399995485815, 0.08270299997548136, 0.08274200001778809, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981299993138236, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08372299998882227, "p50": 0.08510199995725998, "p90": 0.08608299992829416, "mean": 0.08701479998762807, "iqr": 0.0011499998890940333, "raw_times": [0.08493300003920012, 0.09523300002456381, 0.08510199995725998, 0.08372299998882227, 0.08608299992829416], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923300003971235, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08285199999136239, "p50": 0.08483300007355865, "p90": 0.08511300006830425, "mean": 0.08449480001218035, "iqr": 0.0016500000583619112, "raw_times": [0.08285199999136239, 0.08346300000994233, 0.08483300007355865, 0.08621299991773412, 0.08511300006830425], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08870299996033282, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08278300003894401, "p50": 0.08427300008406746, "p90": 0.08444299999155191, "mean": 0.08422300002166594, "iqr": 0.0002599999788799323, "raw_times": [0.08444299999155191, 0.08418300001267198, 0.08278300003894401, 0.08543299998109433, 0.08427300008406746], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08903299999474257, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08352199995442788, "p50": 0.0842329999386493, "p90": 0.08553300006042264, "mean": 0.08496079999531503, "iqr": 0.0014400000054592965, "raw_times": [0.08409300005496334, 0.08742299996811198, 0.08553300006042264, 0.08352199995442788, 0.0842329999386493], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985400006622513, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14414499992199126, "p50": 0.14512600000671227, "p90": 0.14515400005166157, "mean": 0.1465472000063528, "iqr": 0.0008580000212532468, "raw_times": [0.14512600000671227, 0.14414499992199126, 0.14429600003040832, 0.15401500002099056, 0.14515400005166157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14571500003057736, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16020600003230356, "p50": 0.16135600003508443, "p90": 0.16139600006681576, "mean": 0.16140360005465482, "iqr": 0.00029099999210302485, "raw_times": [0.16139600006681576, 0.1629550000643576, 0.16110500007471273, 0.16020600003230356, 0.16135600003508443], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1623660000404925, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07979300005445111, "p50": 0.08039299996198679, "p90": 0.08136300004935038, "mean": 0.08070500002759218, "iqr": 0.001150000002780871, "raw_times": [0.0802130000465695, 0.0817630000256031, 0.07979300005445111, 0.08039299996198679, 0.08136300004935038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0855329999467358, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0800829999434427, "p50": 0.08147299990923784, "p90": 0.08197300007850572, "mean": 0.08146099996793055, "iqr": 0.00109000018255756, "raw_times": [0.0800829999434427, 0.08197300007850572, 0.08147299990923784, 0.08289300001251831, 0.08088299989594816], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08291199992527254, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0799729999698684, "p50": 0.08137199995417177, "p90": 0.081513000054656, "mean": 0.08127659998535819, "iqr": 0.0006500000608866685, "raw_times": [0.0799729999698684, 0.08266199995432544, 0.081513000054656, 0.08086299999376934, 0.08137199995417177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08939400004237541, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08067300007041922, "p50": 0.08162300002823031, "p90": 0.08189199991193163, "mean": 0.08365860001049441, "iqr": 0.0008099999604382901, "raw_times": [0.08067300007041922, 0.08108199995149334, 0.08189199991193163, 0.08162300002823031, 0.09302300009039755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08415299998887349, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0806030000148894, "p50": 0.08186299999124458, "p90": 0.08199299998068454, "mean": 0.08162900001025264, "iqr": 0.001009999891721236, "raw_times": [0.08270299997548136, 0.08186299999124458, 0.0806030000148894, 0.08199299998068454, 0.08098300008896331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10199300004387624, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08040199998049502, "p50": 0.08168299996214046, "p90": 0.08185199999388715, "mean": 0.08171659999334224, "iqr": 0.0013889999763705418, "raw_times": [0.0804630000175166, 0.08418300001267198, 0.08168299996214046, 0.08040199998049502, 0.08185199999388715], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08522300004187855, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08097300008103048, "p50": 0.08150300004672317, "p90": 0.08173299988811777, "mean": 0.08153900000706926, "iqr": 0.0005599998758043512, "raw_times": [0.08117300001231342, 0.08231300000716146, 0.08150300004672317, 0.08173299988811777, 0.08097300008103048], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08440300007350743, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0802130000465695, "p50": 0.08124300006784324, "p90": 0.08242299998073577, "mean": 0.08162480000919459, "iqr": 0.0012000000424450263, "raw_times": [0.0802130000465695, 0.08302200001253368, 0.08242299998073577, 0.08124300006784324, 0.08122299993829074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08460300000479037, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09226300005593657, "p50": 0.09320300000581483, "p90": 0.0934630000983816, "mean": 0.09316100004070904, "iqr": 0.0007800000503266347, "raw_times": [0.09419299999535724, 0.09320300000581483, 0.0934630000983816, 0.09226300005593657, 0.09268300004805496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0951240000404141, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09887299995625654, "p50": 0.09917300008055463, "p90": 0.09990300009121711, "mean": 0.09939520000443736, "iqr": 0.0009100001534534385, "raw_times": [0.09887299995625654, 0.09917300008055463, 0.09990300009121711, 0.10003399995639484, 0.09899299993776367], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1023739999936879, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4842959999677987, "p50": 0.4860569999891595, "p90": 0.4878769999550059, "mean": 0.48646659997757524, "iqr": 0.002959999960694404, "raw_times": [0.4849169999943115, 0.4860569999891595, 0.4878769999550059, 0.4842959999677987, 0.4891859999816006], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4877669999814316, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4968179999877975, "p50": 0.49805800006197387, "p90": 0.4990780000753148, "mean": 0.4983496000022569, "iqr": 0.001141000097959477, "raw_times": [0.4979369999773553, 0.49985699990884314, 0.4990780000753148, 0.49805800006197387, 0.4968179999877975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49727700002222264, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:35Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047051000024112, "p50": 0.048391000007086404, "p90": 0.048571999968771706, "mean": 0.04890720000503279, "iqr": 0.0005509999709829572, "raw_times": [0.0525010000274051, 0.048571999968771706, 0.04802099999778875, 0.048391000007086404, 0.047051000024112], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058991999992485944, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05480199996554802, "p50": 0.05610099998420992, "p90": 0.05628099995647062, "mean": 0.056069199968078465, "iqr": 0.0006299999881775875, "raw_times": [0.057510999965870724, 0.05628099995647062, 0.05610099998420992, 0.055650999968293036, 0.05480199996554802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06025200002568454, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05469199999197372, "p50": 0.056971999981669796, "p90": 0.057361000017408514, "mean": 0.056363600003805914, "iqr": 0.0025490000439276628, "raw_times": [0.05469199999197372, 0.057361000017408514, 0.056971999981669796, 0.05798100005449669, 0.05481199997348085], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0596809999819925, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05416100003685642, "p50": 0.05502099998011545, "p90": 0.05511200004093553, "mean": 0.05489540000098714, "iqr": 0.00016000007008187822, "raw_times": [0.05416100003685642, 0.05495199997085365, 0.055230999976174644, 0.05511200004093553, 0.05502099998011545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05659100003185813, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052401000004920206, "p50": 0.055000999964249786, "p90": 0.056301000029179704, "mean": 0.054810999995424936, "iqr": 0.0023400000372930663, "raw_times": [0.052401000004920206, 0.056301000029179704, 0.056390999986888346, 0.05396099999188664, 0.055000999964249786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05838100003074942, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05270100001553146, "p50": 0.05342100001826111, "p90": 0.054111999986616865, "mean": 0.053611199996339565, "iqr": 0.0008709999974598759, "raw_times": [0.05324099998915699, 0.054580999972131394, 0.054111999986616865, 0.05342100001826111, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0580610000042725, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051810999991630524, "p50": 0.05310099999178419, "p90": 0.053301000036753976, "mean": 0.05577720000928821, "iqr": 0.0007700000423938036, "raw_times": [0.051810999991630524, 0.05253099999436017, 0.06814200003191218, 0.053301000036753976, 0.05310099999178419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056801999960498506, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053270999956112064, "p50": 0.05397199998924407, "p90": 0.05399100001568513, "mean": 0.054061200000887766, "iqr": 0.00023000001192485797, "raw_times": [0.05399100001568513, 0.05531100003963729, 0.053270999956112064, 0.05397199998924407, 0.05376100000376027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056641000014678866, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052231000040592335, "p50": 0.052561000018158666, "p90": 0.0526110000009794, "mean": 0.0529970000116009, "iqr": 0.0002200000039920269, "raw_times": [0.052231000040592335, 0.052390999996987375, 0.05519100000128674, 0.052561000018158666, 0.0526110000009794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05621100001462764, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052561000018158666, "p50": 0.053772000001117704, "p90": 0.05471100001841478, "mean": 0.057879400003457704, "iqr": 0.0021000000174353772, "raw_times": [0.0526110000009794, 0.05471100001841478, 0.053772000001117704, 0.052561000018158666, 0.07574199997861797], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06976200000963217, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05308099997591853, "p90": 0.05349200000637211, "mean": 0.05272739998645193, "iqr": 0.0014310000437944836, "raw_times": [0.052060999962577625, 0.05366199997069998, 0.05308099997591853, 0.051341000016691396, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05661099999088037, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05081099999415528, "p50": 0.053202000003693684, "p90": 0.05362099994954406, "mean": 0.05282339998302632, "iqr": 0.0011499999459374521, "raw_times": [0.05081099999415528, 0.052471000003606605, 0.05362099994954406, 0.054011999964131974, 0.053202000003693684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05642200000011144, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052221000032659504, "p50": 0.05397099999981947, "p90": 0.05448100000648992, "mean": 0.05380100001275423, "iqr": 0.001479999980347202, "raw_times": [0.052221000032659504, 0.05300100002614272, 0.055330999998659536, 0.05448100000648992, 0.05397099999981947], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421999995061924, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164199995988383, "p50": 0.053460999993149017, "p90": 0.05421100001967716, "mean": 0.0532791999944493, "iqr": 0.00139000002263856, "raw_times": [0.0528209999970386, 0.054261000002497894, 0.05421100001967716, 0.05164199995988383, 0.053460999993149017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05741199998965385, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052460999995673774, "p50": 0.0557109999590466, "p90": 0.05585100001326282, "mean": 0.05600519999688913, "iqr": 0.002330000029360235, "raw_times": [0.052460999995673774, 0.0557109999590466, 0.05585100001326282, 0.053520999983902584, 0.06248200003255988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421000005637325, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0540510000064387, "p50": 0.0541219999945497, "p90": 0.05425200004083308, "mean": 0.054337400013082515, "iqr": 0.0001910000264615519, "raw_times": [0.0540510000064387, 0.05425200004083308, 0.05520100000921957, 0.0541219999945497, 0.05406100001437153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05772200000819794, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05302099998516496, "p50": 0.05408099997339377, "p90": 0.0544409999747586, "mean": 0.05571119997966889, "iqr": 0.0008299999763039523, "raw_times": [0.05302099998516496, 0.0544409999747586, 0.06340199996657248, 0.053610999998454645, 0.05408099997339377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05787100002407897, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052391999986411975, "p50": 0.05292200000894809, "p90": 0.053131000015582686, "mean": 0.05318180001268047, "iqr": 0.00026899999738816405, "raw_times": [0.053131000015582686, 0.054602000034265075, 0.052391999986411975, 0.05286200001819452, 0.05292200000894809], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05755099999760205, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05195099998900332, "p50": 0.05479100002503401, "p90": 0.05620100000669481, "mean": 0.05852919999824735, "iqr": 0.0034000000255218765, "raw_times": [0.07690199998933167, 0.05620100000669481, 0.05479100002503401, 0.05195099998900332, 0.052800999981172936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057030999982998765, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05290100000365783, "p50": 0.05457100002104198, "p90": 0.055251000048883725, "mean": 0.055353400011881604, "iqr": 0.001779000058377278, "raw_times": [0.05290100000365783, 0.06057199999531804, 0.055251000048883725, 0.05347199999050645, 0.05457100002104198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056300999972336285, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052152000023397704, "p50": 0.05461199998535449, "p90": 0.05471200000783938, "mean": 0.05381760001910152, "iqr": 0.001900999961890193, "raw_times": [0.052152000023397704, 0.05461199998535449, 0.05480100003296684, 0.05471200000783938, 0.052811000045949186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758200001082514, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05339099999446262, "p50": 0.05463100001179555, "p90": 0.055171999974845676, "mean": 0.05451339999353877, "iqr": 0.0011309999763398082, "raw_times": [0.05404099999850587, 0.055331999988084135, 0.05339099999446262, 0.05463100001179555, 0.055171999974845676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058501000012256554, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05277100001421786, "p50": 0.053860999969401746, "p90": 0.054361000024982786, "mean": 0.053951200004576094, "iqr": 0.0007190000133050489, "raw_times": [0.05277100001421786, 0.05512100000260034, 0.05364200001167774, 0.053860999969401746, 0.054361000024982786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057851999997637904, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124100005104992, "p50": 0.053691000005073874, "p90": 0.054261999991922494, "mean": 0.05327740001348502, "iqr": 0.0014609999539061391, "raw_times": [0.05124100005104992, 0.05439199998136246, 0.054261999991922494, 0.053691000005073874, 0.052801000038016355], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05714199994599767, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py
index 2e38669a505cbdf181a93e97f31ed1e67ecf4883..725b12c4018e4eec05c5ddccb0c88a8eae6f150d 100644
--- a/causal_conv1d/impls/cells/benchmark.py
+++ b/causal_conv1d/impls/cells/benchmark.py
@@ -4,37 +4,28 @@
 #     "numpy",
 #     "torch==2.8.0",
 #     "kernels-benchmark-tools",
+#     "kernels",
 # ]
 #
 # [tool.uv.sources]
 # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
-import torch.nn.functional as F
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
 
+# Load the causal conv1d kernel
+causal_conv1d = get_kernel("kernels-community/causal-conv1d")
 
-def torch_causal_conv1d(input_tensor, weight, bias):
-    # Convert to weight dtype for computation
-    x = input_tensor.to(weight.dtype)
-    dim = weight.shape[0]
-    width = weight.shape[1]
-    seqlen = input_tensor.shape[-1]
 
-    # Depthwise causal conv1d using PyTorch
-    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
-
-    # Truncate to original sequence length
-    out = out[..., :seqlen]
-
-    # Convert back to original dtype
-    return out.to(input_tensor.dtype)
+def hf_kernels_causal_conv1d(input_tensor, weight, bias):
+    return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
-    impl_name="torch_eager",
-    impl_tags={"family": "pytorch", "backend": "eager"},
-    impl_func=torch_causal_conv1d,
+    impl_name="hf_kernels_causal_conv1d",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_causal_conv1d,
 )
\ No newline at end of file
diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
index e50cedeff51b83afce46864a23939e763973b082..025d1f7d39597f6702f2ef95b801eca2a6d706e8 100644
--- a/causal_conv1d/impls/hf_kernels_causal_conv1d.html
+++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.21s
+Cell: nv | 0.24s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:09 2025       
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:27:09 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   28C    P0             80W /  350W |       0MiB /  46068MiB |     19%      Default |
+| N/A   33C    P0            109W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 9.91s
+Cell: benchmark | 5.79s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3973,19 +3973,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     153.312us      3772.44%     153.312us     153.312us             1  
-                               hf_kernels_causal_conv1d         8.26%     153.696us        99.59%       1.854ms       1.854ms       0.000us         0.00%       5.504us       5.504us             1  
-                                         CausalConv1dFn         6.06%     112.844us        91.33%       1.700ms     566.616us       0.000us         0.00%       5.504us       1.835us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      26.281us        81.37%       1.514ms     504.821us       4.064us       100.00%       5.504us       1.835us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
-                                Activity Buffer Request        77.27%       1.438ms        77.27%       1.438ms       1.438ms       1.440us        35.43%       1.440us       1.440us             1  
-                                       aten::empty_like         1.15%      21.339us         3.90%      72.543us      24.181us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.75%      51.204us         2.75%      51.204us      17.068us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.69%      50.001us         2.69%      50.001us      16.667us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.41%       7.700us         0.41%       7.700us       7.700us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     151.393us      3724.31%     151.393us     151.393us             1  
+                               hf_kernels_causal_conv1d         8.95%     166.324us        99.62%       1.852ms       1.852ms       0.000us         0.00%       5.505us       5.505us             1  
+                                         CausalConv1dFn         6.05%     112.563us        90.67%       1.686ms     561.934us       0.000us         0.00%       5.505us       1.835us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      26.172us        80.97%       1.505ms     501.826us       4.065us       100.00%       5.505us       1.835us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.065us       100.00%       4.065us       1.355us             3  
+                                Activity Buffer Request        77.14%       1.434ms        77.14%       1.434ms       1.434ms       1.440us        35.42%       1.440us       1.440us             1  
+                                       aten::empty_like         1.03%      19.059us         3.64%      67.761us      22.587us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.62%      48.702us         2.62%      48.702us      16.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.42%      45.061us         2.42%      45.061us      15.020us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.38%       7.150us         0.38%       7.150us       7.150us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.861ms
-Self CUDA time total: 4.064us
+Self CPU time total: 1.859ms
+Self CUDA time total: 4.065us
 
 
 
@@ -3995,19 +3995,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.895us      3412.63%     128.895us     128.895us             1  
-                               hf_kernels_causal_conv1d         5.00%      84.832us        99.68%       1.692ms       1.692ms       0.000us         0.00%       5.026us       5.026us             1  
-                                         CausalConv1dFn         4.43%      75.123us        94.68%       1.607ms     535.685us       0.000us         0.00%       5.026us       1.675us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.59%      27.059us        88.41%       1.501ms     500.224us       3.777us       100.00%       5.026us       1.675us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
-                                Activity Buffer Request        84.88%       1.441ms        84.88%       1.441ms       1.441ms       1.249us        33.07%       1.249us       1.249us             1  
-                                       aten::empty_like         0.54%       9.230us         1.84%      31.262us      10.421us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.30%      22.032us         1.30%      22.032us       7.344us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.94%      32.892us         1.94%      32.892us      10.964us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.32%       5.440us         0.32%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.439us      3456.32%     129.439us     129.439us             1  
+                               hf_kernels_causal_conv1d         5.79%      99.043us        99.68%       1.706ms       1.706ms       0.000us         0.00%       4.994us       4.994us             1  
+                                         CausalConv1dFn         4.71%      80.562us        93.90%       1.607ms     535.793us       0.000us         0.00%       4.994us       1.665us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      25.130us        87.50%       1.498ms     499.285us       3.745us       100.00%       4.994us       1.665us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.745us       100.00%       3.745us       1.248us             3  
+                                Activity Buffer Request        84.17%       1.441ms        84.17%       1.441ms       1.441ms       1.249us        33.35%       1.249us       1.249us             1  
+                                       aten::empty_like         0.47%       7.980us         1.69%      28.961us       9.654us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.23%      20.981us         1.23%      20.981us       6.994us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.86%      31.821us         1.86%      31.821us      10.607us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.32%       5.430us         0.32%       5.430us       5.430us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.697ms
-Self CUDA time total: 3.777us
+Self CPU time total: 1.712ms
+Self CUDA time total: 3.745us
 
 
 
@@ -4017,19 +4017,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.670us      3273.90%     124.670us     124.670us             1  
-                               hf_kernels_causal_conv1d         4.86%      81.824us        99.65%       1.679ms       1.679ms       0.000us         0.00%       5.056us       5.056us             1  
-                                         CausalConv1dFn         4.28%      72.081us        94.80%       1.598ms     532.512us       0.000us         0.00%       5.056us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.53%      25.732us        88.63%       1.494ms     497.871us       3.808us       100.00%       5.056us       1.685us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.808us       100.00%       3.808us       1.269us             3  
-                                Activity Buffer Request        85.15%       1.435ms        85.15%       1.435ms       1.435ms       1.248us        32.77%       1.248us       1.248us             1  
-                                       aten::empty_like         0.59%       9.910us         1.89%      31.841us      10.614us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.30%      21.931us         1.30%      21.931us       7.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.96%      32.960us         1.96%      32.960us      10.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.35%       5.830us         0.35%       5.830us       5.830us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.098us      3285.62%     124.098us     124.098us             1  
+                               hf_kernels_causal_conv1d         5.52%      95.683us        99.69%       1.728ms       1.728ms       0.000us         0.00%       5.057us       5.057us             1  
+                                         CausalConv1dFn         4.48%      77.582us        94.17%       1.632ms     544.020us       0.000us         0.00%       5.057us       1.686us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      24.830us        87.99%       1.525ms     508.322us       3.777us       100.00%       5.057us       1.686us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.777us       100.00%       3.777us       1.259us             3  
+                                Activity Buffer Request        84.76%       1.469ms        84.76%       1.469ms       1.469ms       1.280us        33.89%       1.280us       1.280us             1  
+                                       aten::empty_like         0.46%       7.920us         1.70%      29.511us       9.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.25%      21.591us         1.25%      21.591us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.80%      31.261us         1.80%      31.261us      10.420us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.301us         0.31%       5.301us       5.301us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.685ms
-Self CUDA time total: 3.808us
+Self CPU time total: 1.733ms
+Self CUDA time total: 3.777us
 
 
 
@@ -4039,19 +4039,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.358us      3479.68%     131.358us     131.358us             1  
-                               hf_kernels_causal_conv1d         4.44%      83.422us        99.71%       1.875ms       1.875ms       0.000us         0.00%       5.054us       5.054us             1  
-                                         CausalConv1dFn         4.02%      75.643us        95.28%       1.792ms     597.348us       0.000us         0.00%       5.054us       1.685us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      25.501us        89.54%       1.684ms     561.363us       3.775us       100.00%       5.054us       1.685us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.775us       100.00%       3.775us       1.258us             3  
-                                Activity Buffer Request        75.66%       1.423ms        75.66%       1.423ms       1.423ms       1.279us        33.88%       1.279us       1.279us             1  
-                                       aten::empty_like         0.55%      10.279us         1.72%      32.311us      10.770us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.17%      22.032us         1.17%      22.032us       7.344us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        12.52%     235.449us        12.52%     235.449us      78.483us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.400us         0.29%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.729us      3378.36%     129.729us     129.729us             1  
+                               hf_kernels_causal_conv1d         5.03%      97.232us        99.72%       1.927ms       1.927ms       0.000us         0.00%       5.120us       5.120us             1  
+                                         CausalConv1dFn         4.11%      79.452us        94.69%       1.830ms     610.049us       0.000us         0.00%       5.120us       1.707us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.27%      24.481us        89.03%       1.721ms     573.588us       3.840us       100.00%       5.120us       1.707us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.840us       100.00%       3.840us       1.280us             3  
+                                Activity Buffer Request        76.40%       1.477ms        76.40%       1.477ms       1.477ms       1.280us        33.33%       1.280us       1.280us             1  
+                                       aten::empty_like         0.41%       7.951us         1.55%      29.931us       9.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.14%      21.980us         1.14%      21.980us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        11.36%     219.575us        11.36%     219.575us      73.192us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.490us         0.28%       5.490us       5.490us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.881ms
-Self CUDA time total: 3.775us
+Self CPU time total: 1.933ms
+Self CUDA time total: 3.840us
 
 
 
@@ -4061,19 +4061,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.694us      2701.96%     129.694us     129.694us             1  
-                               hf_kernels_causal_conv1d         4.57%      82.923us        99.70%       1.809ms       1.809ms       0.000us         0.00%       6.432us       6.432us             1  
-                                         CausalConv1dFn         4.25%      77.065us        95.13%       1.727ms     575.517us       0.000us         0.00%       6.432us       2.144us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      25.889us        89.13%       1.618ms     539.172us       4.800us       100.00%       6.432us       2.144us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
-                                Activity Buffer Request        78.67%       1.428ms        78.67%       1.428ms       1.428ms       1.632us        34.00%       1.632us       1.632us             1  
-                                       aten::empty_like         0.53%       9.690us         1.76%      31.970us      10.657us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.23%      22.280us         1.23%      22.280us       7.427us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.03%     163.837us         9.03%     163.837us      54.612us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       5.391us         0.30%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.080us      2644.30%     126.080us     126.080us             1  
+                               hf_kernels_causal_conv1d         5.18%     102.863us        99.75%       1.979ms       1.979ms       0.000us         0.00%       6.368us       6.368us             1  
+                                         CausalConv1dFn         3.95%      78.303us        94.57%       1.876ms     625.402us       0.000us         0.00%       6.368us       2.123us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      24.140us        89.14%       1.768ms     589.491us       4.768us       100.00%       6.368us       2.123us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.768us       100.00%       4.768us       1.589us             3  
+                                Activity Buffer Request        79.49%       1.577ms        79.49%       1.577ms       1.577ms       1.600us        33.56%       1.600us       1.600us             1  
+                                       aten::empty_like         0.40%       7.900us         1.48%      29.430us       9.810us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.09%      21.530us         1.09%      21.530us       7.177us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.43%     167.184us         8.43%     167.184us      55.728us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       4.910us         0.25%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.815ms
-Self CUDA time total: 4.800us
+Self CPU time total: 1.984ms
+Self CUDA time total: 4.768us
 
 
 
@@ -4083,19 +4083,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.655us      2439.95%     118.655us     118.655us             1  
-                               hf_kernels_causal_conv1d        15.62%      77.102us        98.87%     488.177us     488.177us       0.000us         0.00%       6.495us       6.495us             1  
-                                         CausalConv1dFn        14.62%      72.193us        83.25%     411.075us     137.025us       0.000us         0.00%       6.495us       2.165us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.27%      26.040us        62.53%     308.751us     102.917us       4.863us       100.00%       6.495us       2.165us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.863us       100.00%       4.863us       1.621us             3  
-                                Activity Buffer Request        25.28%     124.815us        25.28%     124.815us     124.815us       1.632us        33.56%       1.632us       1.632us             1  
-                                       aten::empty_like         1.61%       7.949us         6.10%      30.131us      10.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.49%      22.182us         4.49%      22.182us       7.394us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        31.98%     157.896us        31.98%     157.896us      52.632us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.13%       5.580us         1.13%       5.580us       5.580us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.055us      2488.80%     121.055us     121.055us             1  
+                               hf_kernels_causal_conv1d        13.09%      78.123us        99.20%     592.205us     592.205us       0.000us         0.00%       6.528us       6.528us             1  
+                                         CausalConv1dFn        13.01%      77.643us        86.11%     514.082us     171.361us       0.000us         0.00%       6.528us       2.176us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.18%      24.929us        68.36%     408.089us     136.030us       4.864us       100.00%       6.528us       2.176us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.864us       100.00%       4.864us       1.621us             3  
+                                Activity Buffer Request        36.63%     218.665us        36.63%     218.665us     218.665us       1.664us        34.21%       1.664us       1.664us             1  
+                                       aten::empty_like         1.31%       7.839us         4.75%      28.350us       9.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.44%      20.511us         3.44%      20.511us       6.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.55%     164.495us        27.55%     164.495us      54.832us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.80%       4.790us         0.80%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 493.757us
-Self CUDA time total: 4.863us
+Self CPU time total: 596.995us
+Self CUDA time total: 4.864us
 
 
 
@@ -4105,19 +4105,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.463us      1179.69%     126.463us     126.463us             1  
-                               hf_kernels_causal_conv1d         4.44%      79.793us        99.69%       1.793ms       1.793ms       0.000us         0.00%      14.304us      14.304us             1  
-                                         CausalConv1dFn         3.96%      71.252us        95.25%       1.713ms     571.037us       0.000us         0.00%      14.304us       4.768us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.37%      24.661us        89.51%       1.610ms     536.652us      10.720us       100.00%      14.304us       4.768us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.720us       100.00%      10.720us       3.573us             3  
-                                Activity Buffer Request        79.30%       1.426ms        79.30%       1.426ms       1.426ms       3.584us        33.43%       3.584us       3.584us             1  
-                                       aten::empty_like         0.54%       9.750us         1.77%      31.901us      10.634us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.23%      22.151us         1.23%      22.151us       7.384us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.84%     159.036us         8.84%     159.036us      53.012us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.660us         0.31%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.031us      1201.49%     128.031us     128.031us             1  
+                               hf_kernels_causal_conv1d         5.58%     105.873us        99.72%       1.893ms       1.893ms       0.000us         0.00%      14.208us      14.208us             1  
+                                         CausalConv1dFn         4.13%      78.341us        94.14%       1.787ms     595.748us       0.000us         0.00%      14.208us       4.736us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.45%      27.570us        88.49%       1.680ms     559.957us      10.656us       100.00%      14.208us       4.736us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us       100.00%      10.656us       3.552us             3  
+                                Activity Buffer Request        77.94%       1.480ms        77.94%       1.480ms       1.480ms       3.552us        33.33%       3.552us       3.552us             1  
+                                       aten::empty_like         0.41%       7.812us         1.53%      29.032us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.12%      21.220us         1.12%      21.220us       7.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.09%     172.624us         9.09%     172.624us      57.541us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.330us         0.28%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.799ms
-Self CUDA time total: 10.720us
+Self CPU time total: 1.898ms
+Self CUDA time total: 10.656us
 
 
 
@@ -4127,19 +4127,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.490us      1115.98%     122.490us     122.490us             1  
-                               hf_kernels_causal_conv1d        17.58%      82.141us        98.94%     462.145us     462.145us       0.000us         0.00%      14.656us      14.656us             1  
-                                         CausalConv1dFn        15.46%      72.195us        81.35%     380.004us     126.668us       0.000us         0.00%      14.656us       4.885us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.51%      25.720us        59.56%     278.229us      92.743us      10.976us       100.00%      14.656us       4.885us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
-                                Activity Buffer Request        20.67%      96.553us        20.67%      96.553us      96.553us       3.680us        33.53%       3.680us       3.680us             1  
-                                       aten::empty_like         1.79%       8.340us         6.33%      29.580us       9.860us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.55%      21.240us         4.55%      21.240us       7.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.39%     155.956us        33.39%     155.956us      51.985us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.06%       4.970us         1.06%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.524us      1119.66%     122.524us     122.524us             1  
+                               hf_kernels_causal_conv1d        19.00%     100.263us        99.02%     522.563us     522.563us       0.000us         0.00%      14.623us      14.623us             1  
+                                         CausalConv1dFn        14.56%      76.813us        80.02%     422.300us     140.767us       0.000us         0.00%      14.623us       4.874us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.04%      26.621us        60.06%     316.927us     105.642us      10.943us       100.00%      14.623us       4.874us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.943us       100.00%      10.943us       3.648us             3  
+                                Activity Buffer Request        24.63%     129.993us        24.63%     129.993us     129.993us       3.680us        33.63%       3.680us       3.680us             1  
+                                       aten::empty_like         1.53%       8.070us         5.41%      28.560us       9.520us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.88%      20.490us         3.88%      20.490us       6.830us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.38%     160.313us        30.38%     160.313us      53.438us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       5.160us         0.98%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 467.115us
-Self CUDA time total: 10.976us
+Self CPU time total: 527.723us
+Self CUDA time total: 10.943us
 
 
 
@@ -4149,18 +4149,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.671us      1165.50%     128.671us     128.671us             1  
-                               hf_kernels_causal_conv1d         4.51%      81.351us        99.72%       1.798ms       1.798ms       0.000us         0.00%      14.784us      14.784us             1  
-                                         CausalConv1dFn         4.05%      73.093us        95.21%       1.717ms     572.174us       0.000us         0.00%      14.784us       4.928us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.34%      24.081us        89.39%       1.612ms     537.183us      11.040us       100.00%      14.784us       4.928us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.879us      1185.50%     130.879us     130.879us             1  
+                               hf_kernels_causal_conv1d         6.10%     112.423us        99.71%       1.839ms       1.839ms       0.000us         0.00%      14.752us      14.752us             1  
+                                         CausalConv1dFn         4.42%      81.553us        93.62%       1.726ms     575.457us       0.000us         0.00%      14.752us       4.917us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.34%      24.629us        87.45%       1.613ms     537.533us      11.040us       100.00%      14.752us       4.917us             3  
 void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.040us       100.00%      11.040us       3.680us             3  
-                                Activity Buffer Request        79.34%       1.430ms        79.34%       1.430ms       1.430ms       3.744us        33.91%       3.744us       3.744us             1  
-                                       aten::empty_like         0.49%       8.921us         1.77%      31.881us      10.627us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.27%      22.960us         1.27%      22.960us       7.653us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.72%     157.177us         8.72%     157.177us      52.392us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.28%       4.970us         0.28%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        77.44%       1.428ms        77.44%       1.428ms       1.428ms       3.712us        33.62%       3.712us       3.712us             1  
+                                       aten::empty_like         0.46%       8.560us         1.75%      32.220us      10.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.28%      23.660us         1.28%      23.660us       7.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.67%     159.915us         8.67%     159.915us      53.305us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.29%       5.260us         0.29%       5.260us       5.260us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.803ms
+Self CPU time total: 1.844ms
 Self CUDA time total: 11.040us
 
 
@@ -4171,19 +4171,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.762us      1085.65%     125.762us     125.762us             1  
-                               hf_kernels_causal_conv1d        16.83%      79.002us        98.82%     463.887us     463.887us       0.000us         0.00%      15.360us      15.360us             1  
-                                         CausalConv1dFn        15.62%      73.323us        81.99%     384.885us     128.295us       0.000us         0.00%      15.360us       5.120us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.37%      25.230us        59.95%     281.430us      93.810us      11.584us       100.00%      15.360us       5.120us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.584us       100.00%      11.584us       3.861us             3  
-                                Activity Buffer Request        20.79%      97.593us        20.79%      97.593us      97.593us       3.776us        32.60%       3.776us       3.776us             1  
-                                       aten::empty_like         1.82%       8.531us         6.42%      30.132us      10.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.60%      21.601us         4.60%      21.601us       7.200us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.79%     158.607us        33.79%     158.607us      52.869us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.18%       5.530us         1.18%       5.530us       5.530us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.988us      1097.16%     124.988us     124.988us             1  
+                               hf_kernels_causal_conv1d        14.68%      75.042us        98.95%     505.802us     505.802us       0.000us         0.00%      15.232us      15.232us             1  
+                                         CausalConv1dFn        15.20%      77.712us        84.27%     430.760us     143.587us       0.000us         0.00%      15.232us       5.077us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.71%      24.091us        63.54%     324.777us     108.259us      11.392us       100.00%      15.232us       5.077us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.392us       100.00%      11.392us       3.797us             3  
+                                Activity Buffer Request        26.66%     136.263us        26.66%     136.263us     136.263us       3.840us        33.71%       3.840us       3.840us             1  
+                                       aten::empty_like         1.46%       7.441us         5.53%      28.271us       9.424us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.08%      20.830us         4.08%      20.830us       6.943us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.17%     164.423us        32.17%     164.423us      54.808us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.05%       5.351us         1.05%       5.351us       5.351us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 469.417us
-Self CUDA time total: 11.584us
+Self CPU time total: 511.153us
+Self CUDA time total: 11.392us
 
 
 
@@ -4193,19 +4193,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     134.046us       264.80%     134.046us     134.046us             1  
-                               hf_kernels_causal_conv1d         4.19%      76.942us        99.71%       1.832ms       1.832ms       0.000us         0.00%      84.285us      84.285us             1  
-                                         CausalConv1dFn         4.10%      75.381us        95.52%       1.755ms     585.044us       0.000us         0.00%      84.285us      28.095us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.30%      23.952us        89.70%       1.648ms     549.413us      50.622us       100.00%      84.285us      28.095us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.622us       100.00%      50.622us      16.874us             3  
-                                Activity Buffer Request        78.71%       1.446ms        78.71%       1.446ms       1.446ms      33.663us        66.50%      33.663us      33.663us             1  
-                                       aten::empty_like         0.54%       9.991us         1.71%      31.512us      10.504us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.17%      21.521us         1.17%      21.521us       7.174us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.69%     177.966us         9.69%     177.966us      59.322us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.380us         0.29%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.775us       262.12%     131.775us     131.775us             1  
+                               hf_kernels_causal_conv1d         8.81%      77.263us        99.39%     871.362us     871.362us       0.000us         0.00%      83.680us      83.680us             1  
+                                         CausalConv1dFn         8.68%      76.121us        90.57%     794.099us     264.700us       0.000us         0.00%      83.680us      27.893us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.02%      26.501us        78.58%     688.947us     229.649us      50.272us       100.00%      83.680us      27.893us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.272us       100.00%      50.272us      16.757us             3  
+                                Activity Buffer Request        55.77%     488.972us        55.77%     488.972us     488.972us      33.408us        66.45%      33.408us      33.408us             1  
+                                       aten::empty_like         0.92%       8.040us         3.31%      29.031us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.39%      20.991us         2.39%      20.991us       6.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.79%     173.474us        19.79%     173.474us      57.825us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.61%       5.370us         0.61%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.837ms
-Self CUDA time total: 50.622us
+Self CPU time total: 876.732us
+Self CUDA time total: 50.272us
 
 
 
@@ -4215,19 +4215,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.639us       241.17%     124.639us     124.639us             1  
-                               hf_kernels_causal_conv1d        12.15%      73.652us        99.08%     600.632us     600.632us       0.000us         0.00%      86.272us      86.272us             1  
-                                         CausalConv1dFn        11.76%      71.283us        86.93%     526.980us     175.660us       0.000us         0.00%      86.272us      28.757us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.05%      24.580us        70.27%     425.965us     141.988us      51.680us       100.00%      86.272us      28.757us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.680us       100.00%      51.680us      17.227us             3  
-                                Activity Buffer Request        38.62%     234.139us        38.62%     234.139us     234.139us      34.592us        66.93%      34.592us      34.592us             1  
-                                       aten::empty_like         1.31%       7.952us         4.90%      29.732us       9.911us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.59%      21.780us         3.59%      21.780us       7.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        27.59%     167.246us        27.59%     167.246us      55.749us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.92%       5.560us         0.92%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.295us       247.23%     127.295us     127.295us             1  
+                               hf_kernels_causal_conv1d        15.09%      77.332us        99.04%     507.562us     507.562us       0.000us         0.00%      86.016us      86.016us             1  
+                                         CausalConv1dFn        14.68%      75.241us        83.95%     430.230us     143.410us       0.000us         0.00%      86.016us      28.672us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.05%      25.861us        63.40%     324.927us     108.309us      51.488us       100.00%      86.016us      28.672us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.488us       100.00%      51.488us      17.163us             3  
+                                Activity Buffer Request        25.26%     129.463us        25.26%     129.463us     129.463us      34.528us        67.06%      34.528us      34.528us             1  
+                                       aten::empty_like         1.67%       8.561us         5.87%      30.062us      10.021us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.20%      21.501us         4.20%      21.501us       7.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.09%     169.603us        33.09%     169.603us      56.534us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.96%       4.929us         0.96%       4.929us       4.929us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 606.192us
-Self CUDA time total: 51.680us
+Self CPU time total: 512.491us
+Self CUDA time total: 51.488us
 
 
 
@@ -4237,18 +4237,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.184us      3001.64%     117.184us     117.184us             1  
-                               hf_kernels_causal_conv1d        11.99%      71.634us        99.07%     591.661us     591.661us       0.000us         0.00%       5.152us       5.152us             1  
-                                         CausalConv1dFn        11.65%      69.552us        87.08%     520.027us     173.342us       0.000us         0.00%       5.152us       1.717us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.09%      24.400us        70.30%     419.834us     139.945us       3.904us       100.00%       5.152us       1.717us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.214us      3104.87%     121.214us     121.214us             1  
+                               hf_kernels_causal_conv1d         8.71%      75.123us        99.37%     856.672us     856.672us       0.000us         0.00%       5.184us       5.184us             1  
+                                         CausalConv1dFn         8.55%      73.741us        90.66%     781.549us     260.516us       0.000us         0.00%       5.184us       1.728us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.92%      25.150us        78.63%     677.857us     225.952us       3.904us       100.00%       5.184us       1.728us             3  
 void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.904us       100.00%       3.904us       1.301us             3  
-                                Activity Buffer Request        39.52%     236.029us        39.52%     236.029us     236.029us       1.248us        31.97%       1.248us       1.248us             1  
-                                       aten::empty_like         1.39%       8.281us         5.13%      30.641us      10.214us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.74%      22.360us         3.74%      22.360us       7.453us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        26.69%     159.405us        26.69%     159.405us      53.135us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.93%       5.550us         0.93%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        56.24%     484.832us        56.24%     484.832us     484.832us       1.280us        32.79%       1.280us       1.280us             1  
+                                       aten::empty_like         1.08%       9.311us         3.47%      29.951us       9.984us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.39%      20.640us         2.39%      20.640us       6.880us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.47%     167.875us        19.47%     167.875us      55.958us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.63%       5.440us         0.63%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 597.211us
+Self CPU time total: 862.112us
 Self CUDA time total: 3.904us
 
 
@@ -4259,19 +4259,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.214us      3308.94%     129.214us     129.214us             1  
-                               hf_kernels_causal_conv1d        14.44%      74.841us        98.93%     512.678us     512.678us       0.000us         0.00%       5.154us       5.154us             1  
-                                         CausalConv1dFn        14.14%      73.283us        84.49%     437.837us     145.946us       0.000us         0.00%       5.154us       1.718us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         6.57%      34.031us        64.55%     334.472us     111.491us       3.905us       100.00%       5.154us       1.718us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.905us       100.00%       3.905us       1.302us             3  
-                                Activity Buffer Request        27.83%     144.225us        27.83%     144.225us     144.225us       1.249us        31.98%       1.249us       1.249us             1  
-                                       aten::empty_like         1.69%       8.750us         5.81%      30.082us      10.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.12%      21.332us         4.12%      21.332us       7.111us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        30.15%     156.216us        30.15%     156.216us      52.072us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.07%       5.520us         1.07%       5.520us       5.520us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.438us      3086.10%     121.438us     121.438us             1  
+                               hf_kernels_causal_conv1d        15.37%      74.422us        98.89%     478.921us     478.921us       0.000us         0.00%       5.183us       5.183us             1  
+                                         CausalConv1dFn        15.69%      75.972us        83.52%     404.499us     134.833us       0.000us         0.00%       5.183us       1.728us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.44%      26.330us        61.72%     298.936us      99.645us       3.935us       100.00%       5.183us       1.728us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.935us       100.00%       3.935us       1.312us             3  
+                                Activity Buffer Request        23.74%     114.963us        23.74%     114.963us     114.963us       1.248us        31.72%       1.248us       1.248us             1  
+                                       aten::empty_like         1.57%       7.609us         6.11%      29.591us       9.864us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.54%      21.982us         4.54%      21.982us       7.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.55%     157.643us        32.55%     157.643us      52.548us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.11%       5.391us         1.11%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 518.198us
-Self CUDA time total: 3.905us
+Self CPU time total: 484.312us
+Self CUDA time total: 3.935us
 
 
 
@@ -4281,19 +4281,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.525us      2939.61%     118.525us     118.525us             1  
-                               hf_kernels_causal_conv1d        13.97%      75.404us        99.13%     534.960us     534.960us       0.000us         0.00%       5.376us       5.376us             1  
-                                         CausalConv1dFn        13.10%      70.683us        85.16%     459.556us     153.185us       0.000us         0.00%       5.376us       1.792us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.73%      25.549us        66.42%     358.442us     119.481us       4.032us       100.00%       5.376us       1.792us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
-                                Activity Buffer Request        32.81%     177.046us        32.81%     177.046us     177.046us       1.344us        33.33%       1.344us       1.344us             1  
-                                       aten::empty_like         1.62%       8.721us         5.64%      30.431us      10.144us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.02%      21.710us         4.02%      21.710us       7.237us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        28.88%     155.847us        28.88%     155.847us      51.949us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.87%       4.710us         0.87%       4.710us       4.710us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     152.157us      3744.94%     152.157us     152.157us             1  
+                               hf_kernels_causal_conv1d        10.88%      77.931us        99.21%     710.327us     710.327us       0.000us         0.00%       5.407us       5.407us             1  
+                                         CausalConv1dFn        11.39%      81.522us        88.32%     632.396us     210.799us       0.000us         0.00%       5.407us       1.802us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.86%      27.639us        72.73%     520.742us     173.581us       4.063us       100.00%       5.407us       1.802us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
+                                Activity Buffer Request        44.05%     315.408us        44.05%     315.408us     315.408us       1.344us        33.08%       1.344us       1.344us             1  
+                                       aten::empty_like         1.15%       8.200us         4.21%      30.132us      10.044us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.06%      21.932us         3.06%      21.932us       7.311us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.82%     177.695us        24.82%     177.695us      59.232us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.79%       5.681us         0.79%       5.681us       5.681us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 539.670us
-Self CUDA time total: 4.032us
+Self CPU time total: 716.008us
+Self CUDA time total: 4.063us
 
 
 
@@ -4303,19 +4303,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     115.905us      2852.70%     115.905us     115.905us             1  
-                               hf_kernels_causal_conv1d        16.16%      74.143us        98.83%     453.315us     453.315us       0.000us         0.00%       5.407us       5.407us             1  
-                                         CausalConv1dFn        14.93%      68.471us        82.67%     379.172us     126.391us       0.000us         0.00%       5.407us       1.802us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.63%      25.811us        61.32%     281.280us      93.760us       4.063us       100.00%       5.407us       1.802us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
-                                Activity Buffer Request        21.83%     100.113us        21.83%     100.113us     100.113us       1.344us        33.08%       1.344us       1.344us             1  
-                                       aten::empty_like         1.88%       8.641us         6.41%      29.421us       9.807us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.53%      20.780us         4.53%      20.780us       6.927us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.87%     155.356us        33.87%     155.356us      51.785us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.17%       5.370us         1.17%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.936us      2951.18%     119.936us     119.936us             1  
+                               hf_kernels_causal_conv1d        15.86%      75.552us        99.00%     471.672us     471.672us       0.000us         0.00%       5.440us       5.440us             1  
+                                         CausalConv1dFn        16.03%      76.383us        83.14%     396.120us     132.040us       0.000us         0.00%       5.440us       1.813us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.35%      25.480us        61.26%     291.866us      97.289us       4.064us       100.00%       5.440us       1.813us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
+                                Activity Buffer Request        23.14%     110.243us        23.14%     110.243us     110.243us       1.376us        33.86%       1.376us       1.376us             1  
+                                       aten::empty_like         1.53%       7.269us         5.85%      27.871us       9.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.32%      20.602us         4.32%      20.602us       6.867us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.77%     156.143us        32.77%     156.143us      52.048us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.00%       4.760us         1.00%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 458.685us
-Self CUDA time total: 4.063us
+Self CPU time total: 476.432us
+Self CUDA time total: 4.064us
 
 
 
@@ -4325,19 +4325,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.141us      2271.97%     122.141us     122.141us             1  
-                               hf_kernels_causal_conv1d        11.82%      75.911us        99.15%     636.712us     636.712us       0.000us         0.00%       7.200us       7.200us             1  
-                                         CausalConv1dFn        11.01%      70.722us        87.33%     560.801us     186.934us       0.000us         0.00%       7.200us       2.400us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.24%      27.210us        71.66%     460.136us     153.379us       5.376us       100.00%       7.200us       2.400us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
-                                Activity Buffer Request        43.06%     276.540us        43.06%     276.540us     276.540us       1.824us        33.93%       1.824us       1.824us             1  
-                                       aten::empty_like         1.25%       8.002us         4.66%      29.943us       9.981us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.42%      21.941us         3.42%      21.941us       7.314us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.35%     156.386us        24.35%     156.386us      52.129us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.85%       5.440us         0.85%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.888us      2401.78%     129.888us     129.888us             1  
+                               hf_kernels_causal_conv1d        13.50%     106.873us        99.32%     785.980us     785.980us       0.000us         0.00%       7.264us       7.264us             1  
+                                         CausalConv1dFn        10.04%      79.422us        85.81%     679.107us     226.369us       0.000us         0.00%       7.264us       2.421us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.32%      26.310us        72.10%     570.564us     190.188us       5.408us       100.00%       7.264us       2.421us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.408us       100.00%       5.408us       1.803us             3  
+                                Activity Buffer Request        48.81%     386.260us        48.81%     386.260us     386.260us       1.856us        34.32%       1.856us       1.856us             1  
+                                       aten::empty_like         1.01%       7.981us         3.68%      29.121us       9.707us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.67%      21.140us         2.67%      21.140us       7.047us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        19.96%     157.994us        19.96%     157.994us      52.665us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.68%       5.410us         0.68%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 642.152us
-Self CUDA time total: 5.376us
+Self CPU time total: 791.390us
+Self CUDA time total: 5.408us
 
 
 
@@ -4347,19 +4347,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.822us      2140.66%     117.822us     117.822us             1  
-                               hf_kernels_causal_conv1d        16.30%      72.964us        98.80%     442.326us     442.326us       0.000us         0.00%       7.392us       7.392us             1  
-                                         CausalConv1dFn        16.19%      72.472us        82.50%     369.362us     123.121us       0.000us         0.00%       7.392us       2.464us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.63%      25.211us        59.71%     267.319us      89.106us       5.504us       100.00%       7.392us       2.464us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.504us       100.00%       5.504us       1.835us             3  
-                                Activity Buffer Request        19.35%      86.632us        19.35%      86.632us      86.632us       1.888us        34.30%       1.888us       1.888us             1  
-                                       aten::empty_like         1.85%       8.281us         6.60%      29.571us       9.857us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.76%      21.290us         4.76%      21.290us       7.097us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.73%     155.476us        34.73%     155.476us      51.825us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.20%       5.391us         1.20%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.463us      2151.92%     118.463us     118.463us             1  
+                               hf_kernels_causal_conv1d        19.47%      96.181us        98.96%     488.812us     488.812us       0.000us         0.00%       7.393us       7.393us             1  
+                                         CausalConv1dFn        15.19%      75.044us        79.49%     392.631us     130.877us       0.000us         0.00%       7.393us       2.464us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.31%      26.241us        58.39%     288.397us      96.132us       5.505us       100.00%       7.393us       2.464us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.505us       100.00%       5.505us       1.835us             3  
+                                Activity Buffer Request        21.50%     106.222us        21.50%     106.222us     106.222us       1.888us        34.30%       1.888us       1.888us             1  
+                                       aten::empty_like         1.50%       7.390us         5.91%      29.190us       9.730us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.41%      21.800us         4.41%      21.800us       7.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.57%     155.934us        31.57%     155.934us      51.978us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.04%       5.140us         1.04%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 447.717us
-Self CUDA time total: 5.504us
+Self CPU time total: 493.952us
+Self CUDA time total: 5.505us
 
 
 
@@ -4369,19 +4369,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.728us       716.97%     125.728us     125.728us             1  
-                               hf_kernels_causal_conv1d        11.80%      75.821us        99.14%     637.002us     637.002us       0.000us         0.00%      23.392us      23.392us             1  
-                                         CausalConv1dFn        11.24%      72.243us        87.34%     561.181us     187.060us       0.000us         0.00%      23.392us       7.797us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.08%      26.210us        71.24%     457.746us     152.582us      17.536us       100.00%      23.392us       7.797us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.536us       100.00%      17.536us       5.845us             3  
-                                Activity Buffer Request        42.92%     275.770us        42.92%     275.770us     275.770us       5.856us        33.39%       5.856us       5.856us             1  
-                                       aten::empty_like         1.45%       9.311us         4.85%      31.192us      10.397us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.41%      21.881us         3.41%      21.881us       7.294us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.24%     155.766us        24.24%     155.766us      51.922us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.86%       5.550us         0.86%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.279us       741.28%     129.279us     129.279us             1  
+                               hf_kernels_causal_conv1d         5.08%      91.861us        99.73%       1.805ms       1.805ms       0.000us         0.00%      23.296us      23.296us             1  
+                                         CausalConv1dFn         4.24%      76.815us        94.65%       1.713ms     571.078us       0.000us         0.00%      23.296us       7.765us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.42%      25.791us        88.76%       1.607ms     535.516us      17.440us       100.00%      23.296us       7.765us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.440us       100.00%      17.440us       5.813us             3  
+                                Activity Buffer Request        78.65%       1.424ms        78.65%       1.424ms       1.424ms       5.856us        33.58%       5.856us       5.856us             1  
+                                       aten::empty_like         0.47%       8.500us         1.65%      29.870us       9.957us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      21.370us         1.18%      21.370us       7.123us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.68%     157.163us         8.68%     157.163us      52.388us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.911us         0.27%       4.911us       4.911us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 642.552us
-Self CUDA time total: 17.536us
+Self CPU time total: 1.810ms
+Self CUDA time total: 17.440us
 
 
 
@@ -4391,19 +4391,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.901us       690.22%     123.901us     123.901us             1  
-                               hf_kernels_causal_conv1d        16.99%      75.711us        98.78%     440.245us     440.245us       0.000us         0.00%      23.967us      23.967us             1  
-                                         CausalConv1dFn        15.81%      70.471us        81.79%     364.534us     121.511us       0.000us         0.00%      23.967us       7.989us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.65%      25.192us        59.40%     264.751us      88.250us      17.951us       100.00%      23.967us       7.989us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.951us       100.00%      17.951us       5.984us             3  
-                                Activity Buffer Request        18.53%      82.593us        18.53%      82.593us      82.593us       6.016us        33.51%       6.016us       6.016us             1  
-                                       aten::empty_like         1.75%       7.802us         6.58%      29.312us       9.771us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.83%      21.510us         4.83%      21.510us       7.170us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.22%     156.966us        35.22%     156.966us      52.322us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.22%       5.440us         1.22%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     139.324us       772.01%     139.324us     139.324us             1  
+                               hf_kernels_causal_conv1d        18.68%      93.362us        99.02%     494.883us     494.883us       0.000us         0.00%      24.095us      24.095us             1  
+                                         CausalConv1dFn        17.38%      86.843us        80.34%     401.521us     133.840us       0.000us         0.00%      24.095us       8.032us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.36%      26.789us        57.15%     285.628us      95.209us      18.047us       100.00%      24.095us       8.032us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
+                                Activity Buffer Request        20.49%     102.403us        20.49%     102.403us     102.403us       6.048us        33.51%       6.048us       6.048us             1  
+                                       aten::empty_like         1.48%       7.399us         5.81%      29.050us       9.683us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.33%      21.651us         4.33%      21.651us       7.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.30%     156.436us        31.30%     156.436us      52.145us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       4.890us         0.98%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 445.685us
-Self CUDA time total: 17.951us
+Self CPU time total: 499.773us
+Self CUDA time total: 18.047us
 
 
 
@@ -4413,19 +4413,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.804us       730.34%     131.804us     131.804us             1  
-                               hf_kernels_causal_conv1d        11.57%      77.592us        99.18%     665.133us     665.133us       0.000us         0.00%      24.094us      24.094us             1  
-                                         CausalConv1dFn        10.93%      73.321us        87.61%     587.541us     195.847us       0.000us         0.00%      24.094us       8.031us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.40%      22.811us        71.94%     482.478us     160.826us      18.047us       100.00%      24.094us       8.031us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.047us       100.00%      18.047us       6.016us             3  
-                                Activity Buffer Request        44.54%     298.731us        44.54%     298.731us     298.731us       6.047us        33.51%       6.047us       6.047us             1  
-                                       aten::empty_like         1.35%       9.049us         4.73%      31.742us      10.581us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.38%      22.693us         3.38%      22.693us       7.564us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        24.00%     160.936us        24.00%     160.936us      53.645us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.82%       5.510us         0.82%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     135.103us       748.58%     135.103us     135.103us             1  
+                               hf_kernels_causal_conv1d         5.37%      98.434us        99.69%       1.829ms       1.829ms       0.000us         0.00%      24.097us      24.097us             1  
+                                         CausalConv1dFn         4.35%      79.821us        94.33%       1.730ms     576.697us       0.000us         0.00%      24.097us       8.032us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      24.912us        88.33%       1.620ms     540.010us      18.048us       100.00%      24.097us       8.032us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.048us       100.00%      18.048us       6.016us             3  
+                                Activity Buffer Request        77.78%       1.427ms        77.78%       1.427ms       1.427ms       6.049us        33.52%       6.049us       6.049us             1  
+                                       aten::empty_like         0.47%       8.550us         1.65%      30.240us      10.080us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      21.690us         1.18%      21.690us       7.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.19%     168.514us         9.19%     168.514us      56.171us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.31%       5.620us         0.31%       5.620us       5.620us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 670.643us
-Self CUDA time total: 18.047us
+Self CPU time total: 1.834ms
+Self CUDA time total: 18.048us
 
 
 
@@ -4435,19 +4435,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.267us       637.87%     122.267us     122.267us             1  
-                               hf_kernels_causal_conv1d        16.94%      75.003us        98.82%     437.665us     437.665us       0.000us         0.00%      25.632us      25.632us             1  
-                                         CausalConv1dFn        15.90%      70.409us        81.89%     362.662us     120.887us       0.000us         0.00%      25.632us       8.544us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.97%      26.462us        59.15%     261.981us      87.327us      19.168us       100.00%      25.632us       8.544us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us       100.00%      19.168us       6.389us             3  
-                                Activity Buffer Request        18.04%      79.883us        18.04%      79.883us      79.883us       6.464us        33.72%       6.464us       6.464us             1  
-                                       aten::empty_like         2.06%       9.102us         6.84%      30.272us      10.091us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.78%      21.170us         4.78%      21.170us       7.057us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.14%     155.636us        35.14%     155.636us      51.879us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.18%       5.220us         1.18%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.684us       694.54%     130.684us     130.684us             1  
+                               hf_kernels_causal_conv1d        18.98%      97.223us        99.02%     507.183us     507.183us       0.000us         0.00%      25.120us      25.120us             1  
+                                         CausalConv1dFn        14.58%      74.692us        80.04%     409.960us     136.653us       0.000us         0.00%      25.120us       8.373us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         6.51%      33.321us        59.71%     305.838us     101.946us      18.816us       100.00%      25.120us       8.373us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.816us       100.00%      18.816us       6.272us             3  
+                                Activity Buffer Request        22.33%     114.353us        22.33%     114.353us     114.353us       6.304us        33.50%       6.304us       6.304us             1  
+                                       aten::empty_like         1.71%       8.769us         5.75%      29.430us       9.810us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.03%      20.661us         4.03%      20.661us       6.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.88%     158.164us        30.88%     158.164us      52.721us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       5.010us         0.98%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 442.885us
-Self CUDA time total: 19.168us
+Self CPU time total: 512.193us
+Self CUDA time total: 18.816us
 
 
 
@@ -4457,19 +4457,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         4.25%      77.621us        99.69%       1.822ms       1.822ms       0.000us         0.00%     163.007us     163.007us             1  
-                                         CausalConv1dFn         4.18%      76.374us        95.44%       1.744ms     581.328us       0.000us         0.00%     163.007us      54.336us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.34%      24.550us        89.50%       1.636ms     545.169us      97.983us       100.00%     163.007us      54.336us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     142.719us       145.66%     142.719us     142.719us             1  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.983us       100.00%      97.983us      32.661us             3  
-                                Activity Buffer Request        79.33%       1.450ms        79.33%       1.450ms       1.450ms      65.024us        66.36%      65.024us      65.024us             1  
-                                       aten::empty_like         0.51%       9.271us         1.76%      32.102us      10.701us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.25%      22.831us         1.25%      22.831us       7.610us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.83%     161.275us         8.83%     161.275us      53.758us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.740us         0.31%       5.740us       5.740us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         6.14%     112.394us        99.70%       1.825ms       1.825ms       0.000us         0.00%     162.754us     162.754us             1  
+                                         CausalConv1dFn         4.41%      80.651us        93.56%       1.713ms     570.927us       0.000us         0.00%     162.754us      54.251us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.37%      25.010us        87.54%       1.603ms     534.193us      97.985us       100.00%     162.754us      54.251us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.737us       147.71%     144.737us     144.737us             1  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.985us       100.00%      97.985us      32.662us             3  
+                                Activity Buffer Request        77.36%       1.416ms        77.36%       1.416ms       1.416ms      64.769us        66.10%      64.769us      64.769us             1  
+                                       aten::empty_like         0.49%       8.901us         1.61%      29.551us       9.850us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.13%      20.650us         1.13%      20.650us       6.883us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.82%     161.445us         8.82%     161.445us      53.815us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.480us         0.30%       5.480us       5.480us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.827ms
-Self CUDA time total: 97.983us
+Self CPU time total: 1.831ms
+Self CUDA time total: 97.985us
 
 
 
@@ -4479,19 +4479,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        17.00%      78.131us        98.89%     454.476us     454.476us       0.000us         0.00%     164.440us     164.440us             1  
-                                         CausalConv1dFn        15.89%      73.024us        81.89%     376.345us     125.448us       0.000us         0.00%     164.440us      54.813us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.76%      26.451us        59.63%     274.060us      91.353us      98.939us       100.00%     164.440us      54.813us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     139.130us       140.62%     139.130us     139.130us             1  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.939us       100.00%      98.939us      32.980us             3  
-                                Activity Buffer Request        18.20%      83.643us        18.20%      83.643us      83.643us      65.501us        66.20%      65.501us      65.501us             1  
-                                       aten::empty_like         1.75%       8.030us         6.37%      29.261us       9.754us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.62%      21.231us         4.62%      21.231us       7.077us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.68%     163.966us        35.68%     163.966us      54.655us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.11%       5.111us         1.11%       5.111us       5.111us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        19.17%      96.654us        98.90%     498.573us     498.573us       0.000us         0.00%     163.900us     163.900us             1  
+                                         CausalConv1dFn        15.33%      77.291us        79.73%     401.919us     133.973us       0.000us         0.00%     163.900us      54.633us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.17%      26.053us        58.73%     296.088us      98.696us      98.813us       100.00%     163.900us      54.633us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.981us       135.59%     133.981us     133.981us             1  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.813us       100.00%      98.813us      32.938us             3  
+                                Activity Buffer Request        22.39%     112.882us        22.39%     112.882us     112.882us      65.087us        65.87%      65.087us      65.087us             1  
+                                       aten::empty_like         1.55%       7.820us         5.66%      28.540us       9.513us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.11%      20.720us         4.11%      20.720us       6.907us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.17%     157.153us        31.17%     157.153us      52.384us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.10%       5.550us         1.10%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 459.587us
-Self CUDA time total: 98.939us
+Self CPU time total: 504.123us
+Self CUDA time total: 98.813us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4502,11 +4502,11 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.06  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.06  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2     0.05  True
@@ -4517,20 +4517,18 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.06  True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Downloading hf-xet (3.2MiB)
- Downloading hf-xet
-Installed 52 packages in 211ms
+Installed 15 packages in 14ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
-Fetching 11 files:  64%|██████▎   | 7/11 [00:02&lt;00:01,  3.26it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:02&lt;00:00,  5.12it/s]</div>
+Fetching 11 files:  64%|██████▎   | 7/11 [00:01&lt;00:01,  3.95it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:01&lt;00:00,  6.21it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/causal_conv1d.jsonl" class="artifact" target="_blank">causal_conv1d.jsonl</a>
diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html
index a14fe1d8732e839025c8dec1c927653b8a3a02ff..2dd29f110a68d2d6a2cb36ff92b20f1c54eab64b 100644
--- a/causal_conv1d/impls/torch_causal_conv1d.html
+++ b/causal_conv1d/impls/torch_causal_conv1d.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.21s
+Cell: nv | 0.24s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:09 2025       
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:27:09 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   28C    P0             80W /  350W |       0MiB /  46068MiB |     19%      Default |
+| N/A   33C    P0            109W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3918,9 +3918,9 @@ Cell: nv | 0.21s
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 3.63s
+Cell: benchmark | 7.23s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3982,29 +3982,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     448.254us      2311.66%     448.254us     448.254us             1  
-                                            torch_eager        10.53%     223.197us        99.60%       2.112ms       2.112ms       0.000us         0.00%      21.727us      21.727us             1  
-                                               aten::to         0.57%      12.032us        79.33%       1.682ms     280.390us       0.000us         0.00%      14.304us       2.384us             6  
-                                         aten::_to_copy         1.82%      38.532us        78.77%       1.670ms     278.384us       0.000us         0.00%      14.304us       2.384us             6  
-                                            aten::copy_         2.94%      62.272us        74.35%       1.577ms     262.784us      11.968us        61.72%      14.304us       2.384us             6  
-                                           aten::conv1d         0.36%       7.640us         7.60%     161.165us      53.722us       0.000us         0.00%       7.423us       2.474us             3  
-                                      aten::convolution         0.68%      14.400us         7.24%     153.525us      51.175us       0.000us         0.00%       7.423us       2.474us             3  
-                                     aten::_convolution         1.64%      34.820us         6.56%     139.125us      46.375us       0.000us         0.00%       7.423us       2.474us             3  
-                                aten::_conv_depthwise2d         1.64%      34.779us         4.03%      85.503us      28.501us       7.423us        38.28%       7.423us       2.474us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.423us        38.28%       7.423us       2.474us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.51%       6.304us       2.101us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.21%       5.664us       1.888us             3  
-                                Activity Buffer Request        68.27%       1.448ms        68.27%       1.448ms       1.448ms       2.336us        12.05%       2.336us       2.336us             1  
-                                    aten::empty_strided         2.60%      55.071us         2.60%      55.071us       9.178us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.35%      92.254us         4.35%      92.254us      10.250us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.39%      29.522us         1.76%      37.262us       4.140us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.59%      12.410us         0.59%      12.410us       0.827us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.52%      10.960us         0.52%      10.960us       3.653us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.67%      14.291us         0.67%      14.291us       4.764us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.39%       8.321us         0.47%       9.881us       3.294us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     465.824us      2410.10%     465.824us     465.824us             1  
+                                            torch_eager        10.38%     221.098us        99.69%       2.123ms       2.123ms       0.000us         0.00%      21.632us      21.632us             1  
+                                               aten::to         0.54%      11.460us        78.80%       1.678ms     279.633us       0.000us         0.00%      14.304us       2.384us             6  
+                                         aten::_to_copy         2.14%      45.672us        78.26%       1.666ms     277.723us       0.000us         0.00%      14.304us       2.384us             6  
+                                            aten::copy_         2.97%      63.201us        73.51%       1.565ms     260.883us      12.000us        62.09%      14.304us       2.384us             6  
+                                           aten::conv1d         0.45%       9.560us         8.33%     177.314us      59.105us       0.000us         0.00%       7.328us       2.443us             3  
+                                      aten::convolution         0.76%      16.270us         7.88%     167.754us      55.918us       0.000us         0.00%       7.328us       2.443us             3  
+                                     aten::_convolution         1.63%      34.781us         7.11%     151.484us      50.495us       0.000us         0.00%       7.328us       2.443us             3  
+                                aten::_conv_depthwise2d         2.18%      46.460us         4.51%      96.001us      32.000us       7.328us        37.91%       7.328us       2.443us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.91%       7.328us       2.443us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.45%       6.272us       2.091us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.64%       5.728us       1.909us             3  
+                                Activity Buffer Request        67.39%       1.435ms        67.39%       1.435ms       1.435ms       2.304us        11.92%       2.304us       2.304us             1  
+                                    aten::empty_strided         2.60%      55.371us         2.60%      55.371us       9.228us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.37%      93.031us         4.37%      93.031us      10.337us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.44%      30.589us         1.81%      38.620us       4.291us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.63%      13.371us         0.63%      13.371us       0.891us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      11.811us         0.55%      11.811us       3.937us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.56%      11.940us         0.56%      11.940us       3.980us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.37%       7.972us         0.46%       9.712us       3.237us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.121ms
-Self CUDA time total: 19.391us
+Self CPU time total: 2.129ms
+Self CUDA time total: 19.328us
 
 
 
@@ -4014,29 +4014,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.942us      1707.32%     334.942us     334.942us             1  
-                                            torch_eager         7.85%     148.604us        99.72%       1.887ms       1.887ms       0.000us         0.00%      21.731us      21.731us             1  
-                                               aten::to         0.32%       6.111us        83.97%       1.589ms     264.793us       0.000us         0.00%      13.731us       2.288us             6  
-                                         aten::_to_copy         1.27%      24.112us        83.64%       1.583ms     263.774us       0.000us         0.00%      13.731us       2.288us             6  
-                                            aten::copy_         2.68%      50.691us        80.81%       1.529ms     254.829us      11.618us        59.22%      13.731us       2.288us             6  
-                                           aten::conv1d         0.29%       5.540us         6.41%     121.373us      40.458us       0.000us         0.00%       8.000us       2.667us             3  
-                                      aten::convolution         0.50%       9.420us         6.12%     115.833us      38.611us       0.000us         0.00%       8.000us       2.667us             3  
-                                     aten::_convolution         1.30%      24.670us         5.62%     106.413us      35.471us       0.000us         0.00%       8.000us       2.667us             3  
-                                aten::_conv_depthwise2d         1.20%      22.792us         3.44%      65.133us      21.711us       8.000us        40.78%       8.000us       2.667us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.000us        40.78%       8.000us       2.667us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.049us        30.83%       6.049us       2.016us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.569us        28.39%       5.569us       1.856us             3  
-                                Activity Buffer Request        75.63%       1.431ms        75.63%       1.431ms       1.431ms       2.113us        10.77%       2.113us       2.113us             1  
-                                    aten::empty_strided         1.56%      29.560us         1.56%      29.560us       4.927us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.72%      70.343us         3.72%      70.343us       7.816us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.90%      17.091us         1.18%      22.301us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.48%       9.090us         0.48%       9.090us       0.606us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%       9.490us         0.50%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.52%       9.830us         0.52%       9.830us       3.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.400us         0.42%       8.020us       2.673us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.863us      1691.38%     332.863us     332.863us             1  
+                                            torch_eager         6.60%     126.115us        99.71%       1.906ms       1.906ms       0.000us         0.00%      21.792us      21.792us             1  
+                                               aten::to         0.31%       5.930us        85.54%       1.635ms     272.467us       0.000us         0.00%      13.760us       2.293us             6  
+                                         aten::_to_copy         1.30%      24.791us        85.23%       1.629ms     271.478us       0.000us         0.00%      13.760us       2.293us             6  
+                                            aten::copy_         2.71%      51.809us        82.30%       1.573ms     262.158us      11.648us        59.19%      13.760us       2.293us             6  
+                                           aten::conv1d         0.31%       5.929us         6.17%     117.852us      39.284us       0.000us         0.00%       8.032us       2.677us             3  
+                                      aten::convolution         0.53%      10.111us         5.86%     111.923us      37.308us       0.000us         0.00%       8.032us       2.677us             3  
+                                     aten::_convolution         1.20%      22.951us         5.33%     101.812us      33.937us       0.000us         0.00%       8.032us       2.677us             3  
+                                aten::_conv_depthwise2d         1.20%      22.860us         3.35%      64.021us      21.340us       8.032us        40.81%       8.032us       2.677us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.032us        40.81%       8.032us       2.677us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        30.89%       6.080us       2.027us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.29%       5.568us       1.856us             3  
+                                Activity Buffer Request        77.00%       1.472ms        77.00%       1.472ms       1.472ms       2.112us        10.73%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.63%      31.132us         1.63%      31.132us       5.189us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.70%      70.762us         3.70%      70.762us       7.862us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      16.659us         1.16%      22.190us       2.466us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       8.781us         0.46%       8.781us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      10.521us         0.55%      10.521us       3.507us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.49%       9.390us         0.49%       9.390us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.540us         0.35%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.892ms
-Self CUDA time total: 19.618us
+Self CPU time total: 1.911ms
+Self CUDA time total: 19.680us
 
 
 
@@ -4046,29 +4046,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     333.691us      1797.81%     333.691us     333.691us             1  
-                                            torch_eager         7.79%     146.606us        99.69%       1.876ms       1.876ms       0.000us         0.00%      20.481us      20.481us             1  
-                                               aten::to         0.31%       5.760us        84.09%       1.582ms     263.706us       0.000us         0.00%      13.569us       2.262us             6  
-                                         aten::_to_copy         1.25%      23.550us        83.79%       1.576ms     262.746us       0.000us         0.00%      13.569us       2.262us             6  
-                                            aten::copy_         2.67%      50.153us        80.95%       1.523ms     253.847us      11.649us        62.76%      13.569us       2.262us             6  
-                                           aten::conv1d         0.31%       5.780us         6.33%     119.033us      39.678us       0.000us         0.00%       6.912us       2.304us             3  
-                                      aten::convolution         0.52%       9.800us         6.02%     113.253us      37.751us       0.000us         0.00%       6.912us       2.304us             3  
-                                     aten::_convolution         1.28%      24.000us         5.50%     103.453us      34.484us       0.000us         0.00%       6.912us       2.304us             3  
-                                aten::_conv_depthwise2d         1.15%      21.640us         3.37%      63.473us      21.158us       6.912us        37.24%       6.912us       2.304us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.912us        37.24%       6.912us       2.304us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.953us        32.07%       5.953us       1.984us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.69%       5.696us       1.899us             3  
-                                Activity Buffer Request        75.77%       1.426ms        75.77%       1.426ms       1.426ms       1.920us        10.34%       1.920us       1.920us             1  
-                                    aten::empty_strided         1.59%      29.840us         1.59%      29.840us       4.973us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.79%      71.241us         3.79%      71.241us       7.916us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      17.220us         1.19%      22.362us       2.485us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       8.782us         0.47%       8.782us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%       9.312us         0.49%       9.312us       3.104us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       8.581us         0.46%       8.581us       2.860us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.33%       6.290us         0.41%       7.740us       2.580us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     355.199us      1913.89%     355.199us     355.199us             1  
+                                            torch_eager         6.67%     125.171us        99.71%       1.872ms       1.872ms       0.000us         0.00%      20.511us      20.511us             1  
+                                               aten::to         0.32%       6.091us        84.23%       1.581ms     263.570us       0.000us         0.00%      13.600us       2.267us             6  
+                                         aten::_to_copy         1.32%      24.859us        83.90%       1.575ms     262.555us       0.000us         0.00%      13.600us       2.267us             6  
+                                            aten::copy_         2.70%      50.760us        80.88%       1.518ms     253.083us      11.648us        62.76%      13.600us       2.267us             6  
+                                           aten::conv1d         0.30%       5.670us         7.37%     138.423us      46.141us       0.000us         0.00%       6.911us       2.304us             3  
+                                      aten::convolution         0.52%       9.720us         7.07%     132.753us      44.251us       0.000us         0.00%       6.911us       2.304us             3  
+                                     aten::_convolution         1.24%      23.210us         6.55%     123.033us      41.011us       0.000us         0.00%       6.911us       2.304us             3  
+                                aten::_conv_depthwise2d         1.26%      23.712us         4.48%      84.033us      28.011us       6.911us        37.24%       6.911us       2.304us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.911us        37.24%       6.911us       2.304us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        32.24%       5.984us       1.995us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        30.52%       5.664us       1.888us             3  
+                                Activity Buffer Request        75.59%       1.419ms        75.59%       1.419ms       1.419ms       1.952us        10.52%       1.952us       1.952us             1  
+                                    aten::empty_strided         1.70%      31.973us         1.70%      31.973us       5.329us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.83%      72.002us         3.83%      72.002us       8.000us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      16.661us         1.15%      21.682us       2.409us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%       8.941us         0.48%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.49%      28.041us         1.49%      28.041us       9.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       8.840us         0.47%       8.840us       2.947us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       5.960us         0.40%       7.470us       2.490us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.882ms
-Self CUDA time total: 18.561us
+Self CPU time total: 1.878ms
+Self CUDA time total: 18.559us
 
 
 
@@ -4078,29 +4078,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.628us      1741.67%     341.628us     341.628us             1  
-                                            torch_eager         6.79%     135.276us        99.76%       1.989ms       1.989ms       0.000us         0.00%      21.759us      21.759us             1  
-                                               aten::to         0.31%       6.091us        85.44%       1.703ms     283.911us       0.000us         0.00%      14.111us       2.352us             6  
-                                         aten::_to_copy         1.20%      23.892us        85.13%       1.697ms     282.896us       0.000us         0.00%      14.111us       2.352us             6  
-                                            aten::copy_         2.47%      49.180us        82.37%       1.642ms     273.716us      11.967us        61.01%      14.111us       2.352us             6  
-                                           aten::conv1d         0.29%       5.740us         6.09%     121.414us      40.471us       0.000us         0.00%       7.648us       2.549us             3  
-                                      aten::convolution         0.55%      11.061us         5.80%     115.674us      38.558us       0.000us         0.00%       7.648us       2.549us             3  
-                                     aten::_convolution         1.19%      23.780us         5.25%     104.613us      34.871us       0.000us         0.00%       7.648us       2.549us             3  
-                                aten::_conv_depthwise2d         1.14%      22.750us         3.26%      64.953us      21.651us       7.648us        38.99%       7.648us       2.549us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.648us        38.99%       7.648us       2.549us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.175us        31.48%       6.175us       2.058us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.792us        29.53%       5.792us       1.931us             3  
-                                Activity Buffer Request        68.82%       1.372ms        68.82%       1.372ms       1.372ms       2.144us        10.93%       2.144us       2.144us             1  
-                                    aten::empty_strided         1.56%      31.190us         1.56%      31.190us       5.198us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.22%     243.619us        12.22%     243.619us      27.069us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.88%      17.629us         1.14%      22.660us       2.518us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.782us         0.44%       8.782us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.630us         0.48%       9.630us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%       9.941us         0.50%       9.941us       3.314us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.720us         0.41%       8.110us       2.703us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.841us      1714.87%     335.841us     335.841us             1  
+                                            torch_eager         6.09%     125.084us        99.75%       2.047ms       2.047ms       0.000us         0.00%      21.728us      21.728us             1  
+                                               aten::to         0.29%       6.012us        86.59%       1.777ms     296.210us       0.000us         0.00%      14.049us       2.341us             6  
+                                         aten::_to_copy         1.18%      24.318us        86.30%       1.771ms     295.209us       0.000us         0.00%      14.049us       2.341us             6  
+                                            aten::copy_         2.44%      50.170us        83.64%       1.717ms     286.105us      11.905us        60.79%      14.049us       2.341us             6  
+                                           aten::conv1d         0.29%       5.981us         5.73%     117.633us      39.211us       0.000us         0.00%       7.679us       2.560us             3  
+                                      aten::convolution         0.48%       9.909us         5.44%     111.652us      37.217us       0.000us         0.00%       7.679us       2.560us             3  
+                                     aten::_convolution         1.11%      22.712us         4.96%     101.743us      33.914us       0.000us         0.00%       7.679us       2.560us             3  
+                                aten::_conv_depthwise2d         1.08%      22.231us         3.11%      63.781us      21.260us       7.679us        39.21%       7.679us       2.560us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.679us        39.21%       7.679us       2.560us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        31.54%       6.176us       2.059us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.729us        29.25%       5.729us       1.910us             3  
+                                Activity Buffer Request        70.17%       1.440ms        70.17%       1.440ms       1.440ms       2.144us        10.95%       2.144us       2.144us             1  
+                                    aten::empty_strided         1.48%      30.301us         1.48%      30.301us       5.050us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.02%     246.676us        12.02%     246.676us      27.408us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.85%      17.450us         1.12%      22.930us       2.548us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.940us         0.44%       8.940us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.630us         0.47%       9.630us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.56%      11.490us         0.56%      11.490us       3.830us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.710us         0.34%       6.930us       2.310us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.994ms
-Self CUDA time total: 19.615us
+Self CPU time total: 2.053ms
+Self CUDA time total: 19.584us
 
 
 
@@ -4110,29 +4110,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.213us      1403.01%     341.213us     341.213us             1  
-                                            torch_eager         7.36%     148.867us        99.73%       2.016ms       2.016ms       0.000us         0.00%      26.560us      26.560us             1  
-                                               aten::to         0.30%       6.030us        84.88%       1.716ms     285.962us       0.000us         0.00%      15.168us       2.528us             6  
-                                         aten::_to_copy         1.20%      24.229us        84.58%       1.710ms     284.956us       0.000us         0.00%      15.168us       2.528us             6  
-                                            aten::copy_         2.44%      49.414us        81.85%       1.655ms     275.782us      12.928us        53.16%      15.168us       2.528us             6  
-                                           aten::conv1d         0.28%       5.730us         5.99%     121.174us      40.391us       0.000us         0.00%      11.392us       3.797us             3  
-                                      aten::convolution         0.47%       9.480us         5.71%     115.444us      38.481us       0.000us         0.00%      11.392us       3.797us             3  
-                                     aten::_convolution         1.14%      23.073us         5.24%     105.964us      35.321us       0.000us         0.00%      11.392us       3.797us             3  
-                                aten::_conv_depthwise2d         1.05%      21.189us         3.24%      65.411us      21.804us      11.392us        46.84%      11.392us       3.797us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.392us        46.84%      11.392us       3.797us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        27.11%       6.592us       2.197us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        26.05%       6.336us       2.112us             3  
-                                Activity Buffer Request        70.12%       1.417ms        70.12%       1.417ms       1.417ms       2.240us         9.21%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.52%      30.820us         1.52%      30.820us       5.137us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.45%     211.347us        10.45%     211.347us      23.483us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.95%      19.208us         1.23%      24.829us       2.759us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.241us         0.46%       9.241us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.482us         0.47%       9.482us       3.161us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.55%      11.190us         0.55%      11.190us       3.730us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.961us         0.41%       8.361us       2.787us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     329.565us      1339.31%     329.565us     329.565us             1  
+                                            torch_eager         6.13%     122.184us        99.75%       1.990ms       1.990ms       0.000us         0.00%      26.911us      26.911us             1  
+                                               aten::to         0.30%       5.979us        86.40%       1.724ms     287.259us       0.000us         0.00%      15.359us       2.560us             6  
+                                         aten::_to_copy         1.37%      27.300us        86.10%       1.718ms     286.262us       0.000us         0.00%      15.359us       2.560us             6  
+                                            aten::copy_         2.45%      48.801us        83.22%       1.660ms     276.655us      13.055us        53.05%      15.359us       2.560us             6  
+                                           aten::conv1d         0.29%       5.841us         5.86%     116.932us      38.977us       0.000us         0.00%      11.552us       3.851us             3  
+                                      aten::convolution         0.50%       9.929us         5.57%     111.091us      37.030us       0.000us         0.00%      11.552us       3.851us             3  
+                                     aten::_convolution         1.16%      23.192us         5.07%     101.162us      33.721us       0.000us         0.00%      11.552us       3.851us             3  
+                                aten::_conv_depthwise2d         1.12%      22.341us         3.11%      62.030us      20.677us      11.552us        46.95%      11.552us       3.851us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        46.95%      11.552us       3.851us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.688us        27.18%       6.688us       2.229us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.367us        25.87%       6.367us       2.122us             3  
+                                Activity Buffer Request        71.71%       1.430ms        71.71%       1.430ms       1.430ms       2.304us         9.36%       2.304us       2.304us             1  
+                                    aten::empty_strided         1.52%      30.342us         1.52%      30.342us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.06%     200.744us        10.06%     200.744us      22.305us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.86%      17.251us         1.14%      22.681us       2.520us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.051us         0.45%       9.051us       0.603us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.579us         0.48%       9.579us       3.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.50%      10.050us         0.50%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.019us         0.36%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.022ms
-Self CUDA time total: 24.320us
+Self CPU time total: 1.995ms
+Self CUDA time total: 24.607us
 
 
 
@@ -4142,29 +4142,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.330us      1285.14%     334.330us     334.330us             1  
-                                            torch_eager         7.10%     143.875us        99.74%       2.020ms       2.020ms       0.000us         0.00%      28.255us      28.255us             1  
-                                               aten::to         0.28%       5.680us        85.25%       1.727ms     287.810us       0.000us         0.00%      15.232us       2.539us             6  
-                                         aten::_to_copy         1.18%      23.873us        84.97%       1.721ms     286.863us       0.000us         0.00%      15.232us       2.539us             6  
-                                            aten::copy_         2.45%      49.640us        82.36%       1.668ms     278.038us      12.992us        49.94%      15.232us       2.539us             6  
-                                           aten::conv1d         0.29%       5.889us         5.94%     120.414us      40.138us       0.000us         0.00%      13.023us       4.341us             3  
-                                      aten::convolution         0.46%       9.401us         5.65%     114.525us      38.175us       0.000us         0.00%      13.023us       4.341us             3  
-                                     aten::_convolution         1.22%      24.611us         5.19%     105.124us      35.041us       0.000us         0.00%      13.023us       4.341us             3  
-                                aten::_conv_depthwise2d         1.06%      21.480us         3.19%      64.562us      21.521us      13.023us        50.06%      13.023us       4.341us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.023us        50.06%      13.023us       4.341us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us        25.46%       6.624us       2.208us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.48%       6.368us       2.123us             3  
-                                Activity Buffer Request        71.17%       1.442ms        71.17%       1.442ms       1.442ms       2.240us         8.61%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.44%      29.082us         1.44%      29.082us       4.847us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.85%     199.548us         9.85%     199.548us      22.172us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.91%      18.470us         1.17%      23.650us       2.628us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.970us         0.44%       8.970us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      10.400us         0.51%      10.400us       3.467us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%      10.200us         0.50%      10.200us       3.400us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.091us         0.38%       7.621us       2.540us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.812us      1379.20%     358.812us     358.812us             1  
+                                            torch_eager         6.94%     139.423us        99.75%       2.005ms       2.005ms       0.000us         0.00%      28.256us      28.256us             1  
+                                               aten::to         0.33%       6.550us        85.45%       1.717ms     286.205us       0.000us         0.00%      15.199us       2.533us             6  
+                                         aten::_to_copy         1.20%      24.182us        85.13%       1.711ms     285.114us       0.000us         0.00%      15.199us       2.533us             6  
+                                            aten::copy_         2.59%      52.130us        82.30%       1.654ms     275.648us      12.959us        49.81%      15.199us       2.533us             6  
+                                           aten::conv1d         0.30%       6.120us         5.97%     119.993us      39.998us       0.000us         0.00%      13.057us       4.352us             3  
+                                      aten::convolution         0.48%       9.660us         5.67%     113.873us      37.958us       0.000us         0.00%      13.057us       4.352us             3  
+                                     aten::_convolution         1.13%      22.802us         5.19%     104.213us      34.738us       0.000us         0.00%      13.057us       4.352us             3  
+                                aten::_conv_depthwise2d         1.09%      21.932us         3.25%      65.242us      21.747us      13.057us        50.19%      13.057us       4.352us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.057us        50.19%      13.057us       4.352us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.623us        25.46%       6.623us       2.208us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.35%       6.336us       2.112us             3  
+                                Activity Buffer Request        70.68%       1.420ms        70.68%       1.420ms       1.420ms       2.240us         8.61%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.62%      32.611us         1.62%      32.611us       5.435us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.17%     204.364us        10.17%     204.364us      22.707us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      17.647us         1.15%      23.189us       2.577us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.382us         0.47%       9.382us       0.625us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.58%      11.651us         0.58%      11.651us       3.884us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       8.769us         0.44%       8.769us       2.923us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.420us         0.39%       7.890us       2.630us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.026ms
-Self CUDA time total: 26.015us
+Self CPU time total: 2.010ms
+Self CUDA time total: 26.016us
 
 
 
@@ -4174,29 +4174,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.315us       888.50%     340.315us     340.315us             1  
-                                            torch_eager         7.29%     147.016us        99.74%       2.012ms       2.012ms       0.000us         0.00%      40.894us      40.894us             1  
-                                           aten::conv1d         0.29%       5.920us         5.91%     119.264us      39.755us       0.000us         0.00%      22.496us       7.499us             3  
-                                      aten::convolution         0.47%       9.411us         5.62%     113.344us      37.781us       0.000us         0.00%      22.496us       7.499us             3  
-                                     aten::_convolution         1.19%      23.960us         5.15%     103.933us      34.644us       0.000us         0.00%      22.496us       7.499us             3  
-                                aten::_conv_depthwise2d         1.11%      22.310us         3.18%      64.143us      21.381us      22.496us        58.73%      22.496us       7.499us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.496us        58.73%      22.496us       7.499us             3  
-                                               aten::to         0.29%       5.851us        85.12%       1.717ms     286.238us       0.000us         0.00%      18.398us       3.066us             6  
-                                         aten::_to_copy         1.17%      23.549us        84.83%       1.712ms     285.263us       0.000us         0.00%      18.398us       3.066us             6  
-                                            aten::copy_         2.43%      48.960us        82.11%       1.657ms     276.121us      15.806us        41.27%      18.398us       3.066us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.97%       8.416us       2.805us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.390us        19.29%       7.390us       2.463us             3  
-                                Activity Buffer Request        70.87%       1.430ms        70.87%       1.430ms       1.430ms       2.592us         6.77%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.55%      31.301us         1.55%      31.301us       5.217us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.91%     199.938us         9.91%     199.938us      22.215us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      17.540us         1.13%      22.711us       2.523us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.912us         0.44%       8.912us       0.594us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.390us         0.47%       9.390us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%      10.361us         0.51%      10.361us       3.454us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.100us         0.37%       7.550us       2.517us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.896us       853.65%     328.896us     328.896us             1  
+                                            torch_eager         6.29%     121.493us        99.73%       1.928ms       1.928ms       0.000us         0.00%      41.088us      41.088us             1  
+                                           aten::conv1d         0.31%       5.961us         6.00%     115.903us      38.634us       0.000us         0.00%      22.688us       7.563us             3  
+                                      aten::convolution         0.50%       9.600us         5.69%     109.942us      36.647us       0.000us         0.00%      22.688us       7.563us             3  
+                                     aten::_convolution         1.16%      22.510us         5.19%     100.342us      33.447us       0.000us         0.00%      22.688us       7.563us             3  
+                                aten::_conv_depthwise2d         1.17%      22.551us         3.25%      62.881us      20.960us      22.688us        58.89%      22.688us       7.563us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.688us        58.89%      22.688us       7.563us             3  
+                                               aten::to         0.33%       6.421us        86.08%       1.664ms     277.308us       0.000us         0.00%      18.400us       3.067us             6  
+                                         aten::_to_copy         1.25%      24.161us        85.75%       1.657ms     276.238us       0.000us         0.00%      18.400us       3.067us             6  
+                                            aten::copy_         2.57%      49.759us        82.93%       1.603ms     267.166us      15.840us        41.11%      18.400us       3.067us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        21.93%       8.448us       2.816us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        19.19%       7.392us       2.464us             3  
+                                Activity Buffer Request        71.07%       1.374ms        71.07%       1.374ms       1.374ms       2.560us         6.64%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.57%      30.271us         1.57%      30.271us       5.045us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.43%     201.525us        10.43%     201.525us      22.392us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.86%      16.701us         1.14%      22.001us       2.445us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       8.751us         0.45%       8.751us       0.583us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.290us         0.48%       9.290us       3.097us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.060us         0.47%       9.060us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.459us         0.35%       6.690us       2.230us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.018ms
-Self CUDA time total: 38.302us
+Self CPU time total: 1.933ms
+Self CUDA time total: 38.528us
 
 
 
@@ -4206,29 +4206,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     363.388us       882.35%     363.388us     363.388us             1  
-                                            torch_eager         8.20%     165.958us        99.73%       2.020ms       2.020ms       0.000us         0.00%      43.808us      43.808us             1  
-                                           aten::conv1d         0.32%       6.510us         6.06%     122.733us      40.911us       0.000us         0.00%      25.408us       8.469us             3  
-                                      aten::convolution         0.48%       9.730us         5.74%     116.223us      38.741us       0.000us         0.00%      25.408us       8.469us             3  
-                                     aten::_convolution         1.17%      23.611us         5.26%     106.493us      35.498us       0.000us         0.00%      25.408us       8.469us             3  
-                                aten::_conv_depthwise2d         1.11%      22.549us         3.28%      66.422us      22.141us      25.408us        61.69%      25.408us       8.469us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.408us        61.69%      25.408us       8.469us             3  
-                                               aten::to         0.31%       6.220us        83.98%       1.701ms     283.450us       0.000us         0.00%      18.400us       3.067us             6  
-                                         aten::_to_copy         1.16%      23.591us        83.68%       1.694ms     282.413us       0.000us         0.00%      18.400us       3.067us             6  
-                                            aten::copy_         2.51%      50.781us        81.00%       1.640ms     273.388us      15.776us        38.31%      18.400us       3.067us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.352us        20.28%       8.352us       2.784us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        18.03%       7.424us       2.475us             3  
-                                Activity Buffer Request        69.68%       1.411ms        69.68%       1.411ms       1.411ms       2.624us         6.37%       2.624us       2.624us             1  
-                                    aten::empty_strided         1.51%      30.560us         1.51%      30.560us       5.093us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.99%     202.397us         9.99%     202.397us      22.489us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.88%      17.759us         1.14%      23.000us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.250us         0.46%       9.250us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      10.382us         0.51%      10.382us       3.461us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.48%       9.651us         0.48%       9.651us       3.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.33%       6.630us         0.40%       8.160us       2.720us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.458us       810.83%     334.458us     334.458us             1  
+                                            torch_eager         6.32%     125.394us        99.75%       1.978ms       1.978ms       0.000us         0.00%      43.841us      43.841us             1  
+                                           aten::conv1d         0.30%       5.899us         5.88%     116.562us      38.854us       0.000us         0.00%      25.600us       8.533us             3  
+                                      aten::convolution         0.49%       9.810us         5.58%     110.663us      36.888us       0.000us         0.00%      25.600us       8.533us             3  
+                                     aten::_convolution         1.13%      22.411us         5.09%     100.853us      33.618us       0.000us         0.00%      25.600us       8.533us             3  
+                                aten::_conv_depthwise2d         1.14%      22.520us         3.20%      63.392us      21.131us      25.600us        62.06%      25.600us       8.533us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.600us        62.06%      25.600us       8.533us             3  
+                                               aten::to         0.30%       5.959us        86.14%       1.708ms     284.675us       0.000us         0.00%      18.241us       3.040us             6  
+                                         aten::_to_copy         1.33%      26.372us        85.84%       1.702ms     283.682us       0.000us         0.00%      18.241us       3.040us             6  
+                                            aten::copy_         2.49%      49.420us        83.02%       1.646ms     274.363us      15.649us        37.94%      18.241us       3.040us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.321us        20.17%       8.321us       2.774us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        17.77%       7.328us       2.443us             3  
+                                Activity Buffer Request        71.51%       1.418ms        71.51%       1.418ms       1.418ms       2.592us         6.28%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.49%      29.540us         1.49%      29.540us       4.923us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.06%     199.427us        10.06%     199.427us      22.159us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      18.199us         1.18%      23.330us       2.592us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       8.651us         0.44%       8.651us       0.577us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.54%      10.640us         0.54%      10.640us       3.547us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.48%       9.610us         0.48%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.590us         0.34%       6.770us       2.257us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.025ms
-Self CUDA time total: 41.184us
+Self CPU time total: 1.983ms
+Self CUDA time total: 41.249us
 
 
 
@@ -4238,29 +4238,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     352.830us       343.38%     352.830us     352.830us             1  
-                                            torch_eager         7.15%     144.983us        99.76%       2.023ms       2.023ms       0.000us         0.00%     108.768us     108.768us             1  
-                                           aten::conv1d         0.29%       5.781us         5.92%     120.074us      40.025us       0.000us         0.00%      70.432us      23.477us             3  
-                                      aten::convolution         0.47%       9.599us         5.64%     114.293us      38.098us       0.000us         0.00%      70.432us      23.477us             3  
-                                     aten::_convolution         1.14%      23.149us         5.16%     104.694us      34.898us       0.000us         0.00%      70.432us      23.477us             3  
-                                aten::_conv_depthwise2d         1.16%      23.581us         3.22%      65.212us      21.737us      70.432us        68.55%      70.432us      23.477us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.432us        68.55%      70.432us      23.477us             3  
-                                               aten::to         0.30%       6.111us        85.26%       1.729ms     288.085us       0.000us         0.00%      38.336us       6.389us             6  
-                                         aten::_to_copy         1.62%      32.820us        84.95%       1.722ms     287.067us       0.000us         0.00%      38.336us       6.389us             6  
-                                            aten::copy_         2.46%      49.781us        81.90%       1.660ms     276.745us      32.320us        31.45%      38.336us       6.389us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        17.22%      17.696us       5.899us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.624us        14.23%      14.624us       4.875us             3  
-                                Activity Buffer Request        70.70%       1.433ms        70.70%       1.433ms       1.433ms       6.016us         5.85%       6.016us       6.016us             1  
-                                    aten::empty_strided         1.44%      29.111us         1.44%      29.111us       4.852us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.89%     200.449us         9.89%     200.449us      22.272us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.88%      17.943us         1.16%      23.512us       2.612us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.330us         0.46%       9.330us       0.622us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.471us         0.47%       9.471us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.45%       9.050us         0.45%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       6.391us         0.39%       7.911us       2.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.849us       326.92%     338.849us     338.849us             1  
+                                            torch_eager         5.95%     117.585us        99.74%       1.970ms       1.970ms       0.000us         0.00%     109.697us     109.697us             1  
+                                           aten::conv1d         0.30%       5.970us         6.05%     119.502us      39.834us       0.000us         0.00%      71.232us      23.744us             3  
+                                      aten::convolution         0.49%       9.700us         5.75%     113.532us      37.844us       0.000us         0.00%      71.232us      23.744us             3  
+                                     aten::_convolution         1.15%      22.781us         5.26%     103.832us      34.611us       0.000us         0.00%      71.232us      23.744us             3  
+                                aten::_conv_depthwise2d         1.18%      23.259us         3.31%      65.420us      21.807us      71.232us        68.72%      71.232us      23.744us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      71.232us        68.72%      71.232us      23.744us             3  
+                                               aten::to         0.31%       6.199us        86.38%       1.706ms     284.313us       0.000us         0.00%      38.465us       6.411us             6  
+                                         aten::_to_copy         1.31%      25.891us        86.06%       1.700ms     283.280us       0.000us         0.00%      38.465us       6.411us             6  
+                                            aten::copy_         2.57%      50.812us        83.17%       1.643ms     273.758us      32.417us        31.28%      38.465us       6.411us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.760us        17.13%      17.760us       5.920us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.657us        14.14%      14.657us       4.886us             3  
+                                Activity Buffer Request        71.61%       1.414ms        71.61%       1.414ms       1.414ms       6.048us         5.84%       6.048us       6.048us             1  
+                                    aten::empty_strided         1.58%      31.240us         1.58%      31.240us       5.207us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.13%     200.155us        10.13%     200.155us      22.239us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      17.181us         1.15%      22.621us       2.513us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       8.941us         0.45%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%      10.050us         0.51%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.370us         0.47%       9.370us       3.123us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.551us         0.35%       6.851us       2.284us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.027ms
-Self CUDA time total: 102.752us
+Self CPU time total: 1.975ms
+Self CUDA time total: 103.649us
 
 
 
@@ -4270,29 +4270,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.363us       292.97%     330.363us     330.363us             1  
-                                            torch_eager        16.10%     118.634us        99.31%     731.955us     731.955us       0.000us         0.00%     118.781us     118.781us             1  
-                                           aten::conv1d         0.80%       5.881us        15.92%     117.344us      39.115us       0.000us         0.00%      80.541us      26.847us             3  
-                                      aten::convolution         1.32%       9.760us        15.12%     111.463us      37.154us       0.000us         0.00%      80.541us      26.847us             3  
-                                     aten::_convolution         3.06%      22.540us        13.80%     101.703us      33.901us       0.000us         0.00%      80.541us      26.847us             3  
-                                aten::_conv_depthwise2d         2.83%      20.841us         8.49%      62.593us      20.864us      80.541us        71.42%      80.541us      26.847us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.541us        71.42%      80.541us      26.847us             3  
-                                               aten::to         0.79%       5.790us        63.53%     468.255us      78.043us       0.000us         0.00%      38.240us       6.373us             6  
-                                         aten::_to_copy         3.21%      23.660us        62.75%     462.465us      77.078us       0.000us         0.00%      38.240us       6.373us             6  
-                                            aten::copy_         6.76%      49.831us        55.55%     409.415us      68.236us      32.224us        28.58%      38.240us       6.373us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.728us        15.72%      17.728us       5.909us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.496us        12.86%      14.496us       4.832us             3  
-                                Activity Buffer Request        25.24%     185.996us        25.24%     185.996us     185.996us       6.016us         5.33%       6.016us       6.016us             1  
-                                    aten::empty_strided         3.99%      29.390us         3.99%      29.390us       4.898us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.60%     196.028us        26.60%     196.028us      21.781us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.44%      17.960us         3.11%      22.951us       2.550us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.15%       8.461us         1.15%       8.461us       0.564us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.35%       9.931us         1.35%       9.931us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.27%       9.381us         1.27%       9.381us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.87%       6.430us         1.06%       7.840us       2.613us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.597us       314.53%     357.597us     357.597us             1  
+                                            torch_eager         6.01%     120.196us        99.73%       1.995ms       1.995ms       0.000us         0.00%     119.645us     119.645us             1  
+                                           aten::conv1d         0.28%       5.578us         6.85%     137.112us      45.704us       0.000us         0.00%      81.344us      27.115us             3  
+                                      aten::convolution         0.47%       9.452us         6.58%     131.534us      43.845us       0.000us         0.00%      81.344us      27.115us             3  
+                                     aten::_convolution         1.16%      23.298us         6.10%     122.082us      40.694us       0.000us         0.00%      81.344us      27.115us             3  
+                                aten::_conv_depthwise2d         1.16%      23.221us         4.15%      82.932us      27.644us      81.344us        71.55%      81.344us      27.115us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      81.344us        71.55%      81.344us      27.115us             3  
+                                               aten::to         0.33%       6.509us        85.46%       1.710ms     284.935us       0.000us         0.00%      38.301us       6.383us             6  
+                                         aten::_to_copy         1.29%      25.870us        85.14%       1.703ms     283.850us       0.000us         0.00%      38.301us       6.383us             6  
+                                            aten::copy_         2.58%      51.531us        82.27%       1.646ms     274.308us      32.350us        28.45%      38.301us       6.383us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.727us        15.59%      17.727us       5.909us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.623us        12.86%      14.623us       4.874us             3  
+                                Activity Buffer Request        70.95%       1.419ms        70.95%       1.419ms       1.419ms       5.951us         5.23%       5.951us       5.951us             1  
+                                    aten::empty_strided         1.57%      31.380us         1.57%      31.380us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.95%     199.044us         9.95%     199.044us      22.116us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      17.740us         1.16%      23.191us       2.577us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.433us         0.47%       9.433us       0.629us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      10.531us         0.53%      10.531us       3.510us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.26%      25.130us         1.26%      25.130us       8.377us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.010us         0.38%       7.612us       2.537us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 737.005us
-Self CUDA time total: 112.765us
+Self CPU time total: 2.000ms
+Self CUDA time total: 113.694us
 
 
 
@@ -4302,29 +4302,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        22.21%     170.695us        99.32%     763.366us     763.366us       0.000us         0.00%     430.770us     430.770us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     416.723us       106.46%     416.723us     416.723us             1  
-                                           aten::conv1d         0.77%       5.951us        14.86%     114.225us      38.075us       0.000us         0.00%     251.288us      83.763us             3  
-                                      aten::convolution         1.24%       9.541us        14.09%     108.274us      36.091us       0.000us         0.00%     251.288us      83.763us             3  
-                                     aten::_convolution         2.83%      21.719us        12.85%      98.733us      32.911us       0.000us         0.00%     251.288us      83.763us             3  
-                                aten::_conv_depthwise2d         2.74%      21.061us         7.99%      61.422us      20.474us     251.288us        64.20%     251.288us      83.763us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.288us        64.20%     251.288us      83.763us             3  
-                                               aten::to         0.75%       5.750us        58.89%     452.676us      75.446us       0.000us         0.00%     179.482us      29.914us             6  
-                                         aten::_to_copy         3.02%      23.182us        58.15%     446.926us      74.488us       0.000us         0.00%     179.482us      29.914us             6  
-                                            aten::copy_         6.40%      49.211us        51.58%     396.473us      66.079us     140.155us        35.80%     179.482us      29.914us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     100.254us        25.61%     100.254us      33.418us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.901us        10.19%      39.901us      13.300us             3  
-                                Activity Buffer Request        22.72%     174.636us        22.72%     174.636us     174.636us      39.327us        10.05%      39.327us      39.327us             1  
-                                    aten::empty_strided         3.55%      27.271us         3.55%      27.271us       4.545us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.36%     194.936us        25.36%     194.936us      21.660us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.13%      16.381us         2.81%      21.611us       2.401us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.16%       8.880us         1.16%       8.880us       0.592us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.18%       9.091us         1.18%       9.091us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%       8.960us         1.17%       8.960us       2.987us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       5.770us         0.94%       7.191us       2.397us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         5.97%     120.782us        97.66%       1.975ms       1.975ms       0.000us         0.00%     434.301us     434.301us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     421.021us       106.85%     421.021us     421.021us             1  
+                                           aten::conv1d         0.30%       6.069us         5.79%     117.202us      39.067us       0.000us         0.00%     251.007us      83.669us             3  
+                                      aten::convolution         0.47%       9.471us         5.49%     111.133us      37.044us       0.000us         0.00%     251.007us      83.669us             3  
+                                     aten::_convolution         1.10%      22.180us         5.03%     101.662us      33.887us       0.000us         0.00%     251.007us      83.669us             3  
+                                aten::_conv_depthwise2d         1.13%      22.779us         3.17%      64.182us      21.394us     251.007us        63.71%     251.007us      83.669us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.007us        63.71%     251.007us      83.669us             3  
+                                               aten::to         0.31%       6.200us        84.52%       1.710ms     284.917us       0.000us         0.00%     183.294us      30.549us             6  
+                                         aten::_to_copy         1.19%      24.072us        84.22%       1.703ms     283.884us       0.000us         0.00%     183.294us      30.549us             6  
+                                            aten::copy_         2.45%      49.593us        81.56%       1.650ms     274.942us     143.007us        36.29%     183.294us      30.549us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.495us        26.01%     102.495us      34.165us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.512us        10.28%      40.512us      13.504us             3  
+                                Activity Buffer Request        70.36%       1.423ms        70.36%       1.423ms       1.423ms      40.287us        10.22%      40.287us      40.287us             1  
+                                    aten::empty_strided         1.46%      29.579us         1.46%      29.579us       4.930us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.86%     199.474us         9.86%     199.474us      22.164us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.84%      17.021us         1.11%      22.432us       2.492us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.090us         0.45%       9.090us       0.606us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.720us         0.48%       9.720us       3.240us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.202us         0.45%       9.202us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.680us         0.35%       7.060us       2.353us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 768.616us
-Self CUDA time total: 391.443us
+Self CPU time total: 2.023ms
+Self CUDA time total: 394.014us
 
 
 
@@ -4334,29 +4334,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        13.26%     117.114us        87.73%     774.557us     774.557us       0.000us         0.00%     486.014us     486.014us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     473.342us       105.98%     473.342us     473.342us             1  
-                                           aten::conv1d         0.63%       5.520us        13.02%     114.943us      38.314us       0.000us         0.00%     298.622us      99.541us             3  
-                                      aten::convolution         1.08%       9.570us        12.39%     109.423us      36.474us       0.000us         0.00%     298.622us      99.541us             3  
-                                     aten::_convolution         2.49%      22.001us        11.31%      99.853us      33.284us       0.000us         0.00%     298.622us      99.541us             3  
-                                aten::_conv_depthwise2d         2.40%      21.190us         7.05%      62.252us      20.751us     298.622us        66.86%     298.622us      99.541us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.622us        66.86%     298.622us      99.541us             3  
-                                               aten::to         0.65%       5.781us        58.29%     514.667us      85.778us       0.000us         0.00%     187.392us      31.232us             6  
-                                         aten::_to_copy         2.57%      22.699us        57.64%     508.886us      84.814us       0.000us         0.00%     187.392us      31.232us             6  
-                                            aten::copy_         5.62%      49.650us        51.80%     457.366us      76.228us     148.032us        33.14%     187.392us      31.232us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.256us        24.24%     108.256us      36.085us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.776us         8.91%      39.776us      13.259us             3  
-                                Activity Buffer Request        26.78%     236.449us        26.78%     236.449us     236.449us      39.360us         8.81%      39.360us      39.360us             1  
-                                    aten::empty_strided         3.26%      28.821us         3.26%      28.821us       4.804us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        22.01%     194.327us        22.01%     194.327us      21.592us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.00%      17.701us         2.60%      22.912us       2.546us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.01%       8.901us         1.01%       8.901us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.05%       9.311us         1.05%       9.311us       3.104us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.98%       8.691us         0.98%       8.691us       2.897us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.65%       5.750us         0.82%       7.230us       2.410us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         5.89%     122.072us        95.29%       1.975ms       1.975ms       0.000us         0.00%     486.458us     486.458us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     474.010us       106.16%     474.010us     474.010us             1  
+                                           aten::conv1d         0.28%       5.830us         5.59%     115.853us      38.618us       0.000us         0.00%     299.291us      99.764us             3  
+                                      aten::convolution         0.46%       9.610us         5.31%     110.023us      36.674us       0.000us         0.00%     299.291us      99.764us             3  
+                                     aten::_convolution         1.08%      22.439us         4.85%     100.413us      33.471us       0.000us         0.00%     299.291us      99.764us             3  
+                                aten::_conv_depthwise2d         1.04%      21.490us         3.04%      62.983us      20.994us     299.291us        67.03%     299.291us      99.764us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     299.291us        67.03%     299.291us      99.764us             3  
+                                               aten::to         0.31%       6.341us        82.51%       1.710ms     284.962us       0.000us         0.00%     187.167us      31.195us             6  
+                                         aten::_to_copy         1.23%      25.592us        82.20%       1.703ms     283.906us       0.000us         0.00%     187.167us      31.195us             6  
+                                            aten::copy_         2.39%      49.481us        79.48%       1.647ms     274.512us     147.199us        32.97%     187.167us      31.195us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     106.911us        23.94%     106.911us      35.637us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.288us         9.02%      40.288us      13.429us             3  
+                                Activity Buffer Request        68.62%       1.422ms        68.62%       1.422ms       1.422ms      39.968us         8.95%      39.968us      39.968us             1  
+                                    aten::empty_strided         1.48%      30.770us         1.48%      30.770us       5.128us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.53%     197.485us         9.53%     197.485us      21.943us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.81%      16.791us         1.08%      22.301us       2.478us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       9.141us         0.44%       9.141us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.701us         0.47%       9.701us       3.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.48%       9.941us         0.48%       9.941us       3.314us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       5.510us         0.33%       6.790us       2.263us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 882.890us
-Self CUDA time total: 446.654us
+Self CPU time total: 2.072ms
+Self CUDA time total: 446.490us
 
 
 
@@ -4366,29 +4366,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.122us      1734.57%     324.122us     324.122us             1  
-                                            torch_eager        15.60%     121.627us        99.38%     775.067us     775.067us       0.000us         0.00%      20.574us      20.574us             1  
-                                               aten::to         0.72%       5.589us        65.70%     512.356us      85.393us       0.000us         0.00%      13.343us       2.224us             6  
-                                         aten::_to_copy         2.88%      22.431us        64.98%     506.767us      84.461us       0.000us         0.00%      13.343us       2.224us             6  
-                                            aten::copy_         6.46%      50.411us        58.51%     456.326us      76.054us      11.455us        61.30%      13.343us       2.224us             6  
-                                           aten::conv1d         0.72%       5.580us        14.59%     113.823us      37.941us       0.000us         0.00%       7.231us       2.410us             3  
-                                      aten::convolution         1.19%       9.260us        13.88%     108.243us      36.081us       0.000us         0.00%       7.231us       2.410us             3  
-                                     aten::_convolution         2.87%      22.359us        12.69%      98.983us      32.994us       0.000us         0.00%       7.231us       2.410us             3  
-                                aten::_conv_depthwise2d         2.67%      20.840us         7.84%      61.153us      20.384us       7.231us        38.70%       7.231us       2.410us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.231us        38.70%       7.231us       2.410us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        31.34%       5.856us       1.952us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.599us        29.96%       5.599us       1.866us             3  
-                                Activity Buffer Request        30.21%     235.608us        30.21%     235.608us     235.608us       1.888us        10.10%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.59%      28.010us         3.59%      28.010us       4.668us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.63%     192.088us        24.63%     192.088us      21.343us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.29%      17.871us         2.95%      23.001us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.13%       8.820us         1.13%       8.820us       0.588us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.21%       9.401us         1.21%       9.401us       3.134us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%       9.131us         1.17%       9.131us       3.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       5.851us         0.94%       7.321us       2.440us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.523us      1924.96%     358.523us     358.523us             1  
+                                            torch_eager        17.94%     139.773us        99.33%     774.049us     774.049us       0.000us         0.00%      20.513us      20.513us             1  
+                                               aten::to         0.94%       7.351us        62.88%     489.983us      81.664us       0.000us         0.00%      13.376us       2.229us             6  
+                                         aten::_to_copy         3.20%      24.930us        61.93%     482.632us      80.439us       0.000us         0.00%      13.376us       2.229us             6  
+                                            aten::copy_         6.90%      53.742us        54.52%     424.881us      70.813us      11.488us        61.68%      13.376us       2.229us             6  
+                                           aten::conv1d         0.75%       5.841us        15.01%     116.973us      38.991us       0.000us         0.00%       7.137us       2.379us             3  
+                                      aten::convolution         1.33%      10.360us        14.26%     111.132us      37.044us       0.000us         0.00%       7.137us       2.379us             3  
+                                     aten::_convolution         3.01%      23.430us        12.93%     100.772us      33.591us       0.000us         0.00%       7.137us       2.379us             3  
+                                aten::_conv_depthwise2d         2.81%      21.882us         7.98%      62.192us      20.731us       7.137us        38.32%       7.137us       2.379us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.137us        38.32%       7.137us       2.379us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        31.61%       5.888us       1.963us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.600us        30.07%       5.600us       1.867us             3  
+                                Activity Buffer Request        24.98%     194.695us        24.98%     194.695us     194.695us       1.888us        10.14%       1.888us       1.888us             1  
+                                    aten::empty_strided         4.21%      32.821us         4.21%      32.821us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.28%     197.004us        25.28%     197.004us      21.889us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.16%      16.850us         2.84%      22.160us       2.462us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.13%       8.821us         1.13%       8.821us       0.588us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.22%       9.521us         1.22%       9.521us       3.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.31%      10.229us         1.31%      10.229us       3.410us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.74%       5.740us         0.90%       7.020us       2.340us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 779.877us
-Self CUDA time total: 18.686us
+Self CPU time total: 779.258us
+Self CUDA time total: 18.625us
 
 
 
@@ -4398,29 +4398,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     316.346us      1628.63%     316.346us     316.346us             1  
-                                            torch_eager        14.51%     117.604us        99.38%     805.188us     805.188us       0.000us         0.00%      21.312us      21.312us             1  
-                                               aten::to         0.69%       5.621us        67.40%     546.068us      91.011us       0.000us         0.00%      13.376us       2.229us             6  
-                                         aten::_to_copy         2.81%      22.789us        66.70%     540.447us      90.075us       0.000us         0.00%      13.376us       2.229us             6  
-                                            aten::copy_         5.89%      47.733us        60.20%     487.757us      81.293us      11.488us        59.14%      13.376us       2.229us             6  
-                                           aten::conv1d         0.69%       5.581us        14.11%     114.294us      38.098us       0.000us         0.00%       7.936us       2.645us             3  
-                                      aten::convolution         1.17%       9.520us        13.42%     108.713us      36.238us       0.000us         0.00%       7.936us       2.645us             3  
-                                     aten::_convolution         2.68%      21.682us        12.24%      99.193us      33.064us       0.000us         0.00%       7.936us       2.645us             3  
-                                aten::_conv_depthwise2d         2.64%      21.391us         7.61%      61.682us      20.561us       7.936us        40.86%       7.936us       2.645us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.86%       7.936us       2.645us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        30.15%       5.856us       1.952us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.00%       5.632us       1.877us             3  
-                                Activity Buffer Request        33.53%     271.649us        33.53%     271.649us     271.649us       1.888us         9.72%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.69%      29.901us         3.69%      29.901us       4.984us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.39%     189.555us        23.39%     189.555us      21.062us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.18%      17.698us         2.81%      22.771us       2.530us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       8.674us         1.07%       8.674us       0.578us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.14%       9.260us         1.14%       9.260us       3.087us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.22%       9.851us         1.22%       9.851us       3.284us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.76%       6.120us         0.93%       7.530us       2.510us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.763us      1698.07%     328.763us     328.763us             1  
+                                            torch_eager        14.65%     115.015us        99.34%     779.670us     779.670us       0.000us         0.00%      21.248us      21.248us             1  
+                                               aten::to         0.80%       6.290us        66.21%     519.631us      86.605us       0.000us         0.00%      13.406us       2.234us             6  
+                                         aten::_to_copy         3.14%      24.649us        65.41%     513.341us      85.557us       0.000us         0.00%      13.406us       2.234us             6  
+                                            aten::copy_         6.80%      53.351us        58.20%     456.761us      76.127us      11.519us        59.50%      13.406us       2.234us             6  
+                                           aten::conv1d         0.75%       5.880us        15.10%     118.484us      39.495us       0.000us         0.00%       7.842us       2.614us             3  
+                                      aten::convolution         1.21%       9.513us        14.35%     112.604us      37.535us       0.000us         0.00%       7.842us       2.614us             3  
+                                     aten::_convolution         2.83%      22.229us        13.14%     103.091us      34.364us       0.000us         0.00%       7.842us       2.614us             3  
+                                aten::_conv_depthwise2d         3.15%      24.720us         8.43%      66.141us      22.047us       7.842us        40.50%       7.842us       2.614us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.842us        40.50%       7.842us       2.614us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.887us        30.41%       5.887us       1.962us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.09%       5.632us       1.877us             3  
+                                Activity Buffer Request        29.55%     231.946us        29.55%     231.946us     231.946us       1.887us         9.75%       1.887us       1.887us             1  
+                                    aten::empty_strided         4.07%      31.931us         4.07%      31.931us       5.322us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.68%     193.684us        24.68%     193.684us      21.520us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      16.541us         2.75%      21.581us       2.398us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       8.568us         1.09%       8.568us       0.571us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.27%       9.951us         1.27%       9.951us       3.317us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.18%       9.250us         1.18%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.642us         0.89%       6.980us       2.327us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 810.248us
-Self CUDA time total: 19.424us
+Self CPU time total: 784.850us
+Self CUDA time total: 19.361us
 
 
 
@@ -4430,29 +4430,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     322.590us      1658.13%     322.590us     322.590us             1  
-                                            torch_eager         6.77%     135.447us        99.76%       1.996ms       1.996ms       0.000us         0.00%      21.631us      21.631us             1  
-                                               aten::to         0.29%       5.801us        85.87%       1.718ms     286.282us       0.000us         0.00%      14.400us       2.400us             6  
-                                         aten::_to_copy         1.16%      23.150us        85.58%       1.712ms     285.315us       0.000us         0.00%      14.400us       2.400us             6  
-                                            aten::copy_         2.46%      49.110us        82.93%       1.659ms     276.491us      12.224us        62.83%      14.400us       2.400us             6  
-                                           aten::conv1d         0.28%       5.690us         5.75%     114.953us      38.318us       0.000us         0.00%       7.231us       2.410us             3  
-                                      aten::convolution         0.48%       9.520us         5.46%     109.263us      36.421us       0.000us         0.00%       7.231us       2.410us             3  
-                                     aten::_convolution         1.10%      21.931us         4.99%      99.743us      33.248us       0.000us         0.00%       7.231us       2.410us             3  
-                                aten::_conv_depthwise2d         1.06%      21.231us         3.12%      62.372us      20.791us       7.231us        37.17%       7.231us       2.410us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.231us        37.17%       7.231us       2.410us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.40%       6.304us       2.101us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        30.43%       5.920us       1.973us             3  
-                                Activity Buffer Request        71.98%       1.440ms        71.98%       1.440ms       1.440ms       2.176us        11.18%       2.176us       2.176us             1  
-                                    aten::empty_strided         1.49%      29.791us         1.49%      29.791us       4.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.66%     193.277us         9.66%     193.277us      21.475us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.86%      17.278us         1.13%      22.539us       2.504us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.001us         0.45%       9.001us       0.600us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.46%       9.281us         0.46%       9.281us       3.094us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.43%       8.570us         0.43%       8.570us       2.857us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.760us         0.36%       7.200us       2.400us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.454us      1698.73%     330.454us     330.454us             1  
+                                            torch_eager        14.50%     115.185us        99.38%     789.290us     789.290us       0.000us         0.00%      21.628us      21.628us             1  
+                                               aten::to         0.75%       5.979us        66.62%     529.132us      88.189us       0.000us         0.00%      14.332us       2.389us             6  
+                                         aten::_to_copy         3.11%      24.732us        65.87%     523.153us      87.192us       0.000us         0.00%      14.332us       2.389us             6  
+                                            aten::copy_         6.75%      53.590us        58.69%     466.101us      77.684us      12.157us        62.49%      14.332us       2.389us             6  
+                                           aten::conv1d         0.72%       5.740us        14.75%     117.122us      39.041us       0.000us         0.00%       7.296us       2.432us             3  
+                                      aten::convolution         1.18%       9.359us        14.02%     111.382us      37.127us       0.000us         0.00%       7.296us       2.432us             3  
+                                     aten::_convolution         2.82%      22.362us        12.85%     102.023us      34.008us       0.000us         0.00%       7.296us       2.432us             3  
+                                aten::_conv_depthwise2d         2.86%      22.741us         8.10%      64.351us      21.450us       7.296us        37.51%       7.296us       2.432us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.296us        37.51%       7.296us       2.432us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.238us        32.07%       6.238us       2.079us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        30.43%       5.919us       1.973us             3  
+                                Activity Buffer Request        30.19%     239.746us        30.19%     239.746us     239.746us       2.175us        11.18%       2.175us       2.175us             1  
+                                    aten::empty_strided         4.07%      32.320us         4.07%      32.320us       5.387us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.58%     195.235us        24.58%     195.235us      21.693us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.10%      16.713us         2.76%      21.891us       2.432us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.12%       8.919us         1.12%       8.919us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.20%       9.570us         1.20%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.709us         0.89%       7.030us       2.343us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.000ms
-Self CUDA time total: 19.455us
+Self CPU time total: 794.200us
+Self CUDA time total: 19.453us
 
 
 
@@ -4462,29 +4462,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.589us      1627.82%     326.589us     326.589us             1  
-                                            torch_eager         7.03%     140.275us        99.72%       1.991ms       1.991ms       0.000us         0.00%      22.207us      22.207us             1  
-                                               aten::to         0.30%       6.010us        85.45%       1.706ms     284.341us       0.000us         0.00%      14.304us       2.384us             6  
-                                         aten::_to_copy         1.18%      23.623us        85.15%       1.700ms     283.340us       0.000us         0.00%      14.304us       2.384us             6  
-                                            aten::copy_         2.42%      48.261us        82.53%       1.648ms     274.613us      12.160us        60.61%      14.304us       2.384us             6  
-                                           aten::conv1d         0.34%       6.690us         5.89%     117.664us      39.221us       0.000us         0.00%       7.903us       2.634us             3  
-                                      aten::convolution         0.46%       9.260us         5.56%     110.974us      36.991us       0.000us         0.00%       7.903us       2.634us             3  
-                                     aten::_convolution         1.15%      23.009us         5.09%     101.714us      33.905us       0.000us         0.00%       7.903us       2.634us             3  
-                                aten::_conv_depthwise2d         1.10%      21.970us         3.15%      62.812us      20.937us       7.903us        39.39%       7.903us       2.634us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us        39.39%       7.903us       2.634us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.10%       6.240us       2.080us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.51%       5.920us       1.973us             3  
-                                Activity Buffer Request        71.49%       1.427ms        71.49%       1.427ms       1.427ms       2.144us        10.69%       2.144us       2.144us             1  
-                                    aten::empty_strided         1.44%      28.740us         1.44%      28.740us       4.790us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.68%     193.308us         9.68%     193.308us      21.479us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.85%      16.982us         1.11%      22.224us       2.469us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.892us         0.45%       8.892us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.420us         0.47%       9.420us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%      10.100us         0.51%      10.100us       3.367us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       6.130us         0.38%       7.650us       2.550us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     325.021us      1622.51%     325.021us     325.021us             1  
+                                            torch_eager        14.95%     114.725us        99.33%     762.279us     762.279us       0.000us         0.00%      22.176us      22.176us             1  
+                                               aten::to         0.78%       5.949us        65.87%     505.530us      84.255us       0.000us         0.00%      14.272us       2.379us             6  
+                                         aten::_to_copy         3.19%      24.509us        65.10%     499.581us      83.264us       0.000us         0.00%      14.272us       2.379us             6  
+                                            aten::copy_         6.59%      50.599us        57.97%     444.890us      74.148us      12.128us        60.54%      14.272us       2.379us             6  
+                                           aten::conv1d         0.79%       6.100us        15.11%     115.973us      38.658us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         1.34%      10.290us        14.32%     109.873us      36.624us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         2.97%      22.812us        12.98%      99.583us      33.194us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         2.93%      22.501us         8.10%      62.182us      20.727us       7.904us        39.46%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        39.46%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        30.99%       6.208us       2.069us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.55%       5.920us       1.973us             3  
+                                Activity Buffer Request        28.71%     220.306us        28.71%     220.306us     220.306us       2.144us        10.70%       2.144us       2.144us             1  
+                                    aten::empty_strided         3.93%      30.182us         3.93%      30.182us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.32%     194.286us        25.32%     194.286us      21.587us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      16.159us         2.76%      21.209us       2.357us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       8.360us         1.09%       8.360us       0.557us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%       9.450us         1.23%       9.450us       3.150us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.29%       9.930us         1.29%       9.930us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.470us         0.87%       6.670us       2.223us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.997ms
-Self CUDA time total: 20.063us
+Self CPU time total: 767.429us
+Self CUDA time total: 20.032us
 
 
 
@@ -4494,29 +4494,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     319.164us       887.36%     319.164us     319.164us             1  
-                                            torch_eager        15.26%     115.785us        99.38%     754.046us     754.046us       0.000us         0.00%      38.560us      38.560us             1  
-                                           aten::conv1d         0.72%       5.471us        14.90%     113.045us      37.682us       0.000us         0.00%      20.097us       6.699us             3  
-                                      aten::convolution         1.25%       9.510us        14.18%     107.574us      35.858us       0.000us         0.00%      20.097us       6.699us             3  
-                                     aten::_convolution         2.85%      21.590us        12.92%      98.064us      32.688us       0.000us         0.00%      20.097us       6.699us             3  
-                                aten::_conv_depthwise2d         2.82%      21.412us         8.06%      61.133us      20.378us      20.097us        55.87%      20.097us       6.699us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.097us        55.87%      20.097us       6.699us             3  
-                                               aten::to         0.74%       5.628us        65.55%     497.346us      82.891us       0.000us         0.00%      18.463us       3.077us             6  
-                                         aten::_to_copy         3.02%      22.942us        64.80%     491.718us      81.953us       0.000us         0.00%      18.463us       3.077us             6  
-                                            aten::copy_         6.50%      49.290us        57.91%     439.376us      73.229us      15.871us        44.13%      18.463us       3.077us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.447us        23.48%       8.447us       2.816us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        20.64%       7.424us       2.475us             3  
-                                Activity Buffer Request        28.99%     219.958us        28.99%     219.958us     219.958us       2.592us         7.21%       2.592us       2.592us             1  
-                                    aten::empty_strided         3.87%      29.400us         3.87%      29.400us       4.900us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.31%     192.058us        25.31%     192.058us      21.340us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.43%      18.410us         3.09%      23.410us       2.601us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.12%       8.490us         1.12%       8.490us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.20%       9.081us         1.20%       9.081us       3.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       8.710us         1.15%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.77%       5.871us         0.96%       7.301us       2.434us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     356.764us       983.15%     356.764us     356.764us             1  
+                                            torch_eager        15.53%     123.844us        99.36%     792.350us     792.350us       0.000us         0.00%      38.944us      38.944us             1  
+                                           aten::conv1d         0.79%       6.320us        15.33%     122.233us      40.744us       0.000us         0.00%      20.320us       6.773us             3  
+                                      aten::convolution         1.24%       9.851us        14.54%     115.913us      38.638us       0.000us         0.00%      20.320us       6.773us             3  
+                                     aten::_convolution         2.89%      23.052us        13.30%     106.062us      35.354us       0.000us         0.00%      20.320us       6.773us             3  
+                                aten::_conv_depthwise2d         2.97%      23.692us         8.39%      66.891us      22.297us      20.320us        56.00%      20.320us       6.773us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.320us        56.00%      20.320us       6.773us             3  
+                                               aten::to         0.80%       6.349us        64.76%     516.391us      86.065us       0.000us         0.00%      18.624us       3.104us             6  
+                                         aten::_to_copy         3.21%      25.572us        63.96%     510.042us      85.007us       0.000us         0.00%      18.624us       3.104us             6  
+                                            aten::copy_         6.54%      52.120us        56.52%     450.739us      75.123us      15.968us        44.00%      18.624us       3.104us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.607us        23.72%       8.607us       2.869us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        20.28%       7.361us       2.454us             3  
+                                Activity Buffer Request        27.46%     218.966us        27.46%     218.966us     218.966us       2.656us         7.32%       2.656us       2.656us             1  
+                                    aten::empty_strided         4.23%      33.731us         4.23%      33.731us       5.622us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.38%     202.413us        25.38%     202.413us      22.490us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.20%      17.520us         2.88%      22.939us       2.549us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.21%       9.679us         1.21%       9.679us       0.645us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.40%      11.140us         1.40%      11.140us       3.713us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.17%       9.299us         1.17%       9.299us       3.100us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       6.010us         0.93%       7.450us       2.483us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 758.766us
-Self CUDA time total: 35.968us
+Self CPU time total: 797.430us
+Self CUDA time total: 36.288us
 
 
 
@@ -4526,29 +4526,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     318.166us       839.07%     318.166us     318.166us             1  
-                                            torch_eager        15.61%     115.614us        99.23%     735.056us     735.056us       0.000us         0.00%      40.512us      40.512us             1  
-                                           aten::conv1d         0.77%       5.689us        15.23%     112.833us      37.611us       0.000us         0.00%      22.206us       7.402us             3  
-                                      aten::convolution         1.28%       9.450us        14.46%     107.144us      35.715us       0.000us         0.00%      22.206us       7.402us             3  
-                                     aten::_convolution         2.90%      21.450us        13.19%      97.694us      32.565us       0.000us         0.00%      22.206us       7.402us             3  
-                                aten::_conv_depthwise2d         2.86%      21.190us         8.15%      60.352us      20.117us      22.206us        58.56%      22.206us       7.402us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.206us        58.56%      22.206us       7.402us             3  
-                                               aten::to         0.76%       5.621us        64.62%     478.657us      79.776us       0.000us         0.00%      18.306us       3.051us             6  
-                                         aten::_to_copy         3.14%      23.241us        63.86%     473.036us      78.839us       0.000us         0.00%      18.306us       3.051us             6  
-                                            aten::copy_         6.66%      49.364us        56.82%     420.865us      70.144us      15.713us        41.44%      18.306us       3.051us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.385us        22.11%       8.385us       2.795us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        19.33%       7.328us       2.443us             3  
-                                Activity Buffer Request        27.11%     200.816us        27.11%     200.816us     200.816us       2.593us         6.84%       2.593us       2.593us             1  
-                                    aten::empty_strided         3.91%      28.930us         3.91%      28.930us       4.822us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.94%     192.117us        25.94%     192.117us      21.346us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.42%      17.932us         3.14%      23.222us       2.580us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.19%       8.781us         1.19%       8.781us       0.585us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.25%       9.270us         1.25%       9.270us       3.090us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.14%       8.460us         1.14%       8.460us       2.820us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.85%       6.280us         1.02%       7.591us       2.530us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.353us       866.25%     332.353us     332.353us             1  
+                                            torch_eager         6.20%     124.083us        99.73%       1.997ms       1.997ms       0.000us         0.00%      40.959us      40.959us             1  
+                                           aten::conv1d         0.30%       6.071us         5.74%     115.013us      38.338us       0.000us         0.00%      22.592us       7.531us             3  
+                                      aten::convolution         0.48%       9.660us         5.44%     108.942us      36.314us       0.000us         0.00%      22.592us       7.531us             3  
+                                     aten::_convolution         1.09%      21.840us         4.96%      99.282us      33.094us       0.000us         0.00%      22.592us       7.531us             3  
+                                aten::_conv_depthwise2d         1.15%      22.991us         3.11%      62.342us      20.781us      22.592us        58.88%      22.592us       7.531us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        58.88%      22.592us       7.531us             3  
+                                               aten::to         0.32%       6.339us        86.44%       1.731ms     288.505us       0.000us         0.00%      18.367us       3.061us             6  
+                                         aten::_to_copy         1.25%      24.980us        86.12%       1.725ms     287.449us       0.000us         0.00%      18.367us       3.061us             6  
+                                            aten::copy_         2.51%      50.252us        83.36%       1.669ms     278.222us      15.775us        41.12%      18.367us       3.061us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.416us        21.94%       8.416us       2.805us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.18%       7.359us       2.453us             3  
+                                Activity Buffer Request        72.13%       1.445ms        72.13%       1.445ms       1.445ms       2.592us         6.76%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.52%      30.382us         1.52%      30.382us       5.064us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.74%     194.985us         9.74%     194.985us      21.665us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      17.330us         1.13%      22.630us       2.514us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       8.941us         0.45%       8.941us       0.596us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.610us         0.48%       9.610us       3.203us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.46%       9.250us         0.46%       9.250us       3.083us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       5.490us         0.34%       6.780us       2.260us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 740.726us
-Self CUDA time total: 37.919us
+Self CPU time total: 2.003ms
+Self CUDA time total: 38.367us
 
 
 
@@ -4558,29 +4558,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     321.366us       502.64%     321.366us     321.366us             1  
-                                            torch_eager        15.27%     113.396us        99.28%     737.126us     737.126us       0.000us         0.00%      68.031us      68.031us             1  
-                                           aten::conv1d         0.76%       5.670us        15.56%     115.503us      38.501us       0.000us         0.00%      41.567us      13.856us             3  
-                                      aten::convolution         1.28%       9.489us        14.79%     109.833us      36.611us       0.000us         0.00%      41.567us      13.856us             3  
-                                     aten::_convolution         3.08%      22.850us        13.52%     100.344us      33.448us       0.000us         0.00%      41.567us      13.856us             3  
-                                aten::_conv_depthwise2d         2.89%      21.483us         8.27%      61.383us      20.461us      41.567us        65.01%      41.567us      13.856us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.567us        65.01%      41.567us      13.856us             3  
-                                               aten::to         0.76%       5.660us        64.85%     481.506us      80.251us       0.000us         0.00%      26.464us       4.411us             6  
-                                         aten::_to_copy         3.08%      22.842us        64.09%     475.846us      79.308us       0.000us         0.00%      26.464us       4.411us             6  
-                                            aten::copy_         6.57%      48.752us        57.01%     423.304us      70.551us      22.368us        34.99%      26.464us       4.411us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.968us        18.72%      11.968us       3.989us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        16.27%      10.400us       3.467us             3  
-                                Activity Buffer Request        27.27%     202.487us        27.27%     202.487us     202.487us       4.096us         6.41%       4.096us       4.096us             1  
-                                    aten::empty_strided         4.00%      29.700us         4.00%      29.700us       4.950us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.15%     194.125us        26.15%     194.125us      21.569us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.30%      17.061us         2.99%      22.191us       2.466us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.19%       8.800us         1.19%       8.800us       0.587us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.25%       9.280us         1.25%       9.280us       3.093us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       8.560us         1.15%       8.560us       2.853us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.77%       5.741us         0.96%       7.151us       2.384us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.952us       509.17%     328.952us     328.952us             1  
+                                            torch_eager        15.31%     114.903us        99.32%     745.599us     745.599us       0.000us         0.00%      68.701us      68.701us             1  
+                                           aten::conv1d         0.89%       6.660us        15.50%     116.373us      38.791us       0.000us         0.00%      42.238us      14.079us             3  
+                                      aten::convolution         1.33%       9.952us        14.61%     109.713us      36.571us       0.000us         0.00%      42.238us      14.079us             3  
+                                     aten::_convolution         2.95%      22.149us        13.29%      99.761us      33.254us       0.000us         0.00%      42.238us      14.079us             3  
+                                aten::_conv_depthwise2d         2.94%      22.090us         8.38%      62.891us      20.964us      42.238us        65.38%      42.238us      14.079us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      42.238us        65.38%      42.238us      14.079us             3  
+                                               aten::to         0.80%       6.039us        65.05%     488.341us      81.390us       0.000us         0.00%      26.463us       4.410us             6  
+                                         aten::_to_copy         3.23%      24.281us        64.25%     482.302us      80.384us       0.000us         0.00%      26.463us       4.410us             6  
+                                            aten::copy_         6.57%      49.302us        56.69%     425.561us      70.927us      22.367us        34.62%      26.463us       4.410us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.936us        18.48%      11.936us       3.979us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        16.15%      10.431us       3.477us             3  
+                                Activity Buffer Request        26.58%     199.565us        26.58%     199.565us     199.565us       4.096us         6.34%       4.096us       4.096us             1  
+                                    aten::empty_strided         4.32%      32.460us         4.32%      32.460us       5.410us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.45%     198.565us        26.45%     198.565us      22.063us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      16.001us         2.81%      21.091us       2.343us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.16%       8.690us         1.16%       8.690us       0.579us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.26%       9.490us         1.26%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.26%       9.440us         1.26%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       5.611us         0.93%       6.981us       2.327us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 742.446us
-Self CUDA time total: 63.935us
+Self CPU time total: 750.709us
+Self CUDA time total: 64.605us
 
 
 
@@ -4590,29 +4590,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.264us       468.36%     326.264us     326.264us             1  
-                                            torch_eager        14.61%     117.663us        99.38%     800.347us     800.347us       0.000us         0.00%      73.789us      73.789us             1  
-                                           aten::conv1d         0.75%       6.020us        14.38%     115.844us      38.615us       0.000us         0.00%      47.230us      15.743us             3  
-                                      aten::convolution         1.16%       9.351us        13.64%     109.824us      36.608us       0.000us         0.00%      47.230us      15.743us             3  
-                                     aten::_convolution         2.76%      22.250us        12.48%     100.473us      33.491us       0.000us         0.00%      47.230us      15.743us             3  
-                                aten::_conv_depthwise2d         2.71%      21.790us         7.76%      62.461us      20.820us      47.230us        67.80%      47.230us      15.743us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.230us        67.80%      47.230us      15.743us             3  
-                                               aten::to         0.71%       5.690us        66.94%     539.059us      89.843us       0.000us         0.00%      26.559us       4.426us             6  
-                                         aten::_to_copy         2.87%      23.082us        66.23%     533.369us      88.895us       0.000us         0.00%      26.559us       4.426us             6  
-                                            aten::copy_         6.12%      49.260us        59.73%     480.976us      80.163us      22.431us        32.20%      26.559us       4.426us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        17.23%      12.000us       4.000us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        14.97%      10.431us       3.477us             3  
-                                Activity Buffer Request        29.99%     241.509us        29.99%     241.509us     241.509us       4.128us         5.93%       4.128us       4.128us             1  
-                                    aten::empty_strided         3.64%      29.311us         3.64%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.37%     212.348us        26.37%     212.348us      23.594us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.22%      17.841us         2.86%      23.041us       2.560us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       8.761us         1.09%       8.761us       0.584us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.16%       9.320us         1.16%       9.320us       3.107us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.14%       9.210us         1.14%       9.210us       3.070us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.77%       6.201us         0.95%       7.621us       2.540us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.798us       467.68%     328.798us     328.798us             1  
+                                            torch_eager        14.69%     115.264us        99.37%     779.669us     779.669us       0.000us         0.00%      74.432us      74.432us             1  
+                                           aten::conv1d         0.75%       5.869us        14.89%     116.853us      38.951us       0.000us         0.00%      47.840us      15.947us             3  
+                                      aten::convolution         1.20%       9.412us        14.15%     110.984us      36.995us       0.000us         0.00%      47.840us      15.947us             3  
+                                     aten::_convolution         2.99%      23.451us        12.95%     101.572us      33.857us       0.000us         0.00%      47.840us      15.947us             3  
+                                aten::_conv_depthwise2d         2.71%      21.281us         8.10%      63.532us      21.177us      47.840us        68.05%      47.840us      15.947us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.840us        68.05%      47.840us      15.947us             3  
+                                               aten::to         0.74%       5.828us        66.46%     521.411us      86.902us       0.000us         0.00%      26.592us       4.432us             6  
+                                         aten::_to_copy         3.27%      25.622us        65.71%     515.583us      85.931us       0.000us         0.00%      26.592us       4.432us             6  
+                                            aten::copy_         6.42%      50.382us        58.46%     458.651us      76.442us      22.464us        31.95%      26.592us       4.432us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.032us        17.11%      12.032us       4.011us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        14.84%      10.432us       3.477us             3  
+                                Activity Buffer Request        29.93%     234.846us        29.93%     234.846us     234.846us       4.128us         5.87%       4.128us       4.128us             1  
+                                    aten::empty_strided         3.99%      31.310us         3.99%      31.310us       5.218us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.83%     194.803us        24.83%     194.803us      21.645us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.07%      16.243us         2.72%      21.332us       2.370us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.401us         1.07%       8.401us       0.560us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.35%      10.581us         1.35%      10.581us       3.527us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.31%      10.290us         1.31%      10.290us       3.430us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.69%       5.406us         0.84%       6.568us       2.189us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 805.317us
-Self CUDA time total: 69.661us
+Self CPU time total: 784.589us
+Self CUDA time total: 70.304us
 
 
 
@@ -4622,29 +4622,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     372.509us       200.60%     372.509us     372.509us             1  
-                                            torch_eager        16.32%     136.903us        99.36%     833.418us     833.418us       0.000us         0.00%     195.711us     195.711us             1  
-                                           aten::conv1d         0.67%       5.580us        15.45%     129.615us      43.205us       0.000us         0.00%     133.247us      44.416us             3  
-                                      aten::convolution         1.13%       9.510us        14.79%     124.035us      41.345us       0.000us         0.00%     133.247us      44.416us             3  
-                                     aten::_convolution         3.89%      32.633us        13.65%     114.525us      38.175us       0.000us         0.00%     133.247us      44.416us             3  
-                                aten::_conv_depthwise2d         2.50%      20.960us         7.87%      66.022us      22.007us     133.247us        71.76%     133.247us      44.416us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.247us        71.76%     133.247us      44.416us             3  
-                                               aten::to         0.72%       6.039us        64.27%     539.099us      89.850us       0.000us         0.00%      62.464us      10.411us             6  
-                                         aten::_to_copy         2.75%      23.094us        63.55%     533.060us      88.843us       0.000us         0.00%      62.464us      10.411us             6  
-                                            aten::copy_         5.97%      50.071us        57.15%     479.385us      79.897us      52.448us        28.24%      62.464us      10.411us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.504us        15.89%      29.504us       9.835us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.944us        12.36%      22.944us       7.648us             3  
-                                Activity Buffer Request        30.64%     256.969us        30.64%     256.969us     256.969us      10.016us         5.39%      10.016us      10.016us             1  
-                                    aten::empty_strided         3.65%      30.581us         3.65%      30.581us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.59%     197.827us        23.59%     197.827us      21.981us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.16%      18.130us         2.81%      23.610us       2.623us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       9.169us         1.09%       9.169us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.19%       9.940us         1.19%       9.940us       3.313us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       9.640us         1.15%       9.640us       3.213us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       6.001us         0.89%       7.490us       2.497us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.882us       182.91%     341.882us     341.882us             1  
+                                            torch_eager        15.14%     117.185us        99.33%     768.879us     768.879us       0.000us         0.00%     197.117us     197.117us             1  
+                                           aten::conv1d         0.79%       6.110us        14.86%     114.993us      38.331us       0.000us         0.00%     134.270us      44.757us             3  
+                                      aten::convolution         1.22%       9.451us        14.07%     108.883us      36.294us       0.000us         0.00%     134.270us      44.757us             3  
+                                     aten::_convolution         2.87%      22.240us        12.85%      99.432us      33.144us       0.000us         0.00%     134.270us      44.757us             3  
+                                aten::_conv_depthwise2d         2.84%      21.991us         8.04%      62.222us      20.741us     134.270us        71.84%     134.270us      44.757us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     134.270us        71.84%     134.270us      44.757us             3  
+                                               aten::to         0.77%       5.950us        65.77%     509.102us      84.850us       0.000us         0.00%      62.847us      10.474us             6  
+                                         aten::_to_copy         3.29%      25.489us        65.00%     503.152us      83.859us       0.000us         0.00%      62.847us      10.474us             6  
+                                            aten::copy_         6.45%      49.889us        57.58%     445.721us      74.287us      52.639us        28.16%      62.847us      10.474us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.728us        15.91%      29.728us       9.909us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.911us        12.26%      22.911us       7.637us             3  
+                                Activity Buffer Request        28.61%     221.416us        28.61%     221.416us     221.416us      10.208us         5.46%      10.208us      10.208us             1  
+                                    aten::empty_strided         4.13%      31.942us         4.13%      31.942us       5.324us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.24%     195.386us        25.24%     195.386us      21.710us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.14%      16.602us         2.90%      22.460us       2.496us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.19%       9.247us         1.19%       9.247us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.23%       9.500us         1.23%       9.500us       3.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.26%       9.761us         1.26%       9.761us       3.254us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.470us         0.87%       6.700us       2.233us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 838.778us
-Self CUDA time total: 185.695us
+Self CPU time total: 774.039us
+Self CUDA time total: 186.909us
 
 
 
@@ -4654,29 +4654,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     368.701us       175.32%     368.701us     368.701us             1  
-                                            torch_eager        16.38%     138.724us        99.39%     841.559us     841.559us       0.000us         0.00%     224.383us     224.383us             1  
-                                           aten::conv1d         0.69%       5.870us        14.05%     118.945us      39.648us       0.000us         0.00%     154.015us      51.338us             3  
-                                      aten::convolution         1.19%      10.050us        13.35%     113.075us      37.692us       0.000us         0.00%     154.015us      51.338us             3  
-                                     aten::_convolution         2.68%      22.669us        12.17%     103.025us      34.342us       0.000us         0.00%     154.015us      51.338us             3  
-                                aten::_conv_depthwise2d         2.54%      21.472us         7.66%      64.883us      21.628us     154.015us        73.23%     154.015us      51.338us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.015us        73.23%     154.015us      51.338us             3  
-                                               aten::to         0.70%       5.911us        65.49%     554.540us      92.423us       0.000us         0.00%      70.368us      11.728us             6  
-                                         aten::_to_copy         2.70%      22.862us        64.79%     548.629us      91.438us       0.000us         0.00%      70.368us      11.728us             6  
-                                            aten::copy_         5.97%      50.511us        58.49%     495.276us      82.546us      56.288us        26.77%      70.368us      11.728us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      33.248us        15.81%      33.248us      11.083us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        10.96%      23.040us       7.680us             3  
-                                Activity Buffer Request        32.21%     272.739us        32.21%     272.739us     272.739us      14.080us         6.70%      14.080us      14.080us             1  
-                                    aten::empty_strided         3.60%      30.491us         3.60%      30.491us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.06%     195.277us        23.06%     195.277us      21.697us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.26%      19.134us         2.91%      24.623us       2.736us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       9.019us         1.07%       9.019us       0.601us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.15%       9.700us         1.15%       9.700us       3.233us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.24%      10.460us         1.24%      10.460us       3.487us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.68%       5.760us         0.85%       7.180us       2.393us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     349.277us       165.88%     349.277us     349.277us             1  
+                                            torch_eager        15.39%     117.165us        99.36%     756.609us     756.609us       0.000us         0.00%     224.029us     224.029us             1  
+                                           aten::conv1d         0.74%       5.661us        15.33%     116.734us      38.911us       0.000us         0.00%     154.686us      51.562us             3  
+                                      aten::convolution         1.20%       9.150us        14.59%     111.073us      37.024us       0.000us         0.00%     154.686us      51.562us             3  
+                                     aten::_convolution         2.96%      22.532us        13.38%     101.923us      33.974us       0.000us         0.00%     154.686us      51.562us             3  
+                                aten::_conv_depthwise2d         2.86%      21.751us         8.47%      64.492us      21.497us     154.686us        73.47%     154.686us      51.562us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.686us        73.47%     154.686us      51.562us             3  
+                                               aten::to         0.84%       6.379us        65.15%     496.150us      82.692us       0.000us         0.00%      69.343us      11.557us             6  
+                                         aten::_to_copy         3.33%      25.371us        64.32%     489.771us      81.628us       0.000us         0.00%      69.343us      11.557us             6  
+                                            aten::copy_         6.44%      49.031us        56.76%     432.240us      72.040us      55.871us        26.53%      69.343us      11.557us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.831us        15.59%      32.831us      10.944us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        10.94%      23.040us       7.680us             3  
+                                Activity Buffer Request        27.33%     208.145us        27.33%     208.145us     208.145us      13.472us         6.40%      13.472us      13.472us             1  
+                                    aten::empty_strided         4.22%      32.160us         4.22%      32.160us       5.360us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.87%     197.025us        25.87%     197.025us      21.892us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.14%      16.329us         2.83%      21.520us       2.391us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.17%       8.932us         1.17%       8.932us       0.595us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.38%      10.500us         1.38%      10.500us       3.500us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.35%      10.280us         1.35%      10.280us       3.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.468us         0.90%       6.839us       2.280us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 846.749us
-Self CUDA time total: 210.303us
+Self CPU time total: 761.499us
+Self CUDA time total: 210.557us
 
 
 
@@ -4686,29 +4686,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.86%     124.525us        53.03%     963.064us     963.064us       0.000us         0.00%       1.524ms       1.524ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.423ms       100.42%       1.423ms       1.423ms             1  
-                                               aten::to         0.37%       6.781us        38.11%     692.105us     115.351us       0.000us         0.00%     827.798us     137.966us             6  
-                                         aten::_to_copy         1.62%      29.329us        37.74%     685.324us     114.221us       0.000us         0.00%     827.798us     137.966us             6  
-                                            aten::copy_         2.86%      52.014us        24.74%     449.228us      74.871us     721.111us        50.87%     827.798us     137.966us             6  
-                                           aten::conv1d         0.32%       5.800us         6.51%     118.154us      39.385us       0.000us         0.00%     696.313us     232.104us             3  
-                                      aten::convolution         0.55%       9.981us         6.19%     112.354us      37.451us       0.000us         0.00%     696.313us     232.104us             3  
-                                     aten::_convolution         1.25%      22.722us         5.64%     102.373us      34.124us       0.000us         0.00%     696.313us     232.104us             3  
-                                aten::_conv_depthwise2d         1.22%      22.241us         3.54%      64.332us      21.444us     696.313us        49.13%     696.313us     232.104us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.313us        49.13%     696.313us     232.104us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     411.194us        29.01%     411.194us     137.065us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     309.917us        21.86%     309.917us     103.306us             3  
-                                Activity Buffer Request        12.02%     218.207us        12.02%     218.207us     218.207us     106.687us         7.53%     106.687us     106.687us             1  
-                                    aten::empty_strided         1.97%      35.692us        11.39%     206.767us      34.461us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.11%     201.717us        11.11%     201.717us      22.413us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.96%      17.369us         1.26%      22.889us       2.543us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.51%       9.249us         0.51%       9.249us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%       9.061us         0.50%       9.061us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.57%      10.320us         0.57%      10.320us       3.440us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.33%       5.990us         0.41%       7.360us       2.453us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.72%     121.944us        52.58%     953.714us     953.714us       0.000us         0.00%       1.521ms       1.521ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.421ms       100.41%       1.421ms       1.421ms             1  
+                                               aten::to         0.35%       6.300us        37.63%     682.555us     113.759us       0.000us         0.00%     824.097us     137.350us             6  
+                                         aten::_to_copy         1.68%      30.549us        37.28%     676.255us     112.709us       0.000us         0.00%     824.097us     137.350us             6  
+                                            aten::copy_         2.98%      53.981us        24.83%     450.422us      75.070us     718.817us        50.79%     824.097us     137.350us             6  
+                                           aten::conv1d         0.35%       6.281us         6.65%     120.554us      40.185us       0.000us         0.00%     696.543us     232.181us             3  
+                                      aten::convolution         0.57%      10.251us         6.30%     114.273us      38.091us       0.000us         0.00%     696.543us     232.181us             3  
+                                     aten::_convolution         1.27%      23.111us         5.73%     104.022us      34.674us       0.000us         0.00%     696.543us     232.181us             3  
+                                aten::_conv_depthwise2d         1.23%      22.359us         3.60%      65.321us      21.774us     696.543us        49.21%     696.543us     232.181us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.543us        49.21%     696.543us     232.181us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     409.920us        28.96%     409.920us     136.640us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     308.897us        21.82%     308.897us     102.966us             3  
+                                Activity Buffer Request        11.98%     217.246us        11.98%     217.246us     217.246us     105.280us         7.44%     105.280us     105.280us             1  
+                                    aten::empty_strided         2.17%      39.370us        10.77%     195.284us      32.547us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.13%     201.976us        11.13%     201.976us      22.442us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.99%      18.030us         1.31%      23.761us       2.640us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.53%       9.620us         0.53%       9.620us       0.641us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.59%      10.751us         0.59%      10.751us       3.584us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.52%       9.430us         0.52%       9.430us       3.143us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.670us         0.39%       7.030us       2.343us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.816ms
-Self CUDA time total: 1.417ms
+Self CPU time total: 1.814ms
+Self CUDA time total: 1.415ms
 
 
 
@@ -4718,33 +4718,33 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.33%     114.706us        41.01%     743.286us     743.286us       0.000us         0.00%       1.500ms       1.500ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.431ms       100.39%       1.431ms       1.431ms             1  
-                                               aten::to         0.32%       5.881us        26.81%     485.936us      80.989us       0.000us         0.00%     762.577us     127.096us             6  
-                                         aten::_to_copy         1.28%      23.109us        26.49%     480.055us      80.009us       0.000us         0.00%     762.577us     127.096us             6  
-                                            aten::copy_         2.74%      49.733us        23.67%     429.056us      71.509us     687.698us        48.25%     762.577us     127.096us             6  
-                                           aten::conv1d         0.31%       5.590us         6.38%     115.623us      38.541us       0.000us         0.00%     737.523us     245.841us             3  
-                                      aten::convolution         0.55%       9.990us         6.07%     110.033us      36.678us       0.000us         0.00%     737.523us     245.841us             3  
-                                     aten::_convolution         1.21%      21.900us         5.52%     100.043us      33.348us       0.000us         0.00%     737.523us     245.841us             3  
-                                aten::_conv_depthwise2d         1.16%      21.072us         3.45%      62.453us      20.818us     737.523us        51.75%     737.523us     245.841us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     737.523us        51.75%     737.523us     245.841us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     400.247us        28.08%     400.247us     133.416us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     287.451us        20.17%     287.451us      95.817us             3  
-                                Activity Buffer Request        11.32%     205.227us        11.32%     205.227us     205.227us      74.879us         5.25%      74.879us      74.879us             1  
-                                    aten::empty_strided         1.54%      27.890us         1.54%      27.890us       4.648us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.89%     197.296us        10.89%     197.296us      21.922us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.95%      17.181us         1.23%      22.321us       2.480us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.49%       8.961us         0.49%       8.961us       0.597us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%       9.050us         0.50%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%       9.131us         0.50%       9.131us       3.044us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       5.870us         0.41%       7.390us       2.463us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         4.05%     123.714us        65.96%       2.016ms       2.016ms       0.000us         0.00%       1.502ms       1.502ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.433ms       100.43%       1.433ms       1.433ms             1  
+                                               aten::to         0.21%       6.507us        56.82%       1.737ms     289.475us       0.000us         0.00%     764.927us     127.488us             6  
+                                         aten::_to_copy         0.85%      25.961us        56.61%       1.730ms     288.391us       0.000us         0.00%     764.927us     127.488us             6  
+                                            aten::copy_         1.76%      53.800us        54.73%       1.673ms     278.832us     689.887us        48.36%     764.927us     127.488us             6  
+                                           aten::conv1d         0.20%       6.220us         4.18%     127.663us      42.554us       0.000us         0.00%     736.735us     245.578us             3  
+                                      aten::convolution         0.34%      10.420us         3.97%     121.443us      40.481us       0.000us         0.00%     736.735us     245.578us             3  
+                                     aten::_convolution         0.75%      22.860us         3.63%     111.023us      37.008us       0.000us         0.00%     736.735us     245.578us             3  
+                                aten::_conv_depthwise2d         0.96%      29.441us         2.37%      72.583us      24.194us     736.735us        51.64%     736.735us     245.578us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     736.735us        51.64%     736.735us     245.578us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     397.471us        27.86%     397.471us     132.490us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     292.416us        20.50%     292.416us      97.472us             3  
+                                Activity Buffer Request        47.26%       1.445ms        47.26%       1.445ms       1.445ms      75.040us         5.26%      75.040us      75.040us             1  
+                                    aten::empty_strided         1.03%      31.391us         1.03%      31.391us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         6.45%     197.169us         6.45%     197.169us      21.908us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.57%      17.300us         0.75%      22.850us       2.539us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.30%       9.200us         0.30%       9.200us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.32%       9.780us         0.32%       9.780us       3.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.36%      10.870us         0.36%      10.870us       3.623us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.19%       5.770us         0.23%       7.180us       2.393us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.812ms
-Self CUDA time total: 1.425ms
+Self CPU time total: 3.057ms
+Self CUDA time total: 1.427ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B2_D2048_S128_W2     0.09  True
+torch_eager              cuda_B2_D2048_S128_W2     0.08  True
 torch_eager              cuda_B2_D2048_S128_W4     0.08  True
 torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
 torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
@@ -4752,7 +4752,7 @@ torch_eager              cuda_B2_D2048_S512_W2     0.08  True
 torch_eager              cuda_B2_D2048_S512_W4     0.08  True
 torch_eager              cuda_B2_D64_S128_W2     0.07  True
 torch_eager              cuda_B2_D64_S128_W4     0.09  True
-torch_eager              cuda_B2_D64_S2048_W2     0.09  True
+torch_eager              cuda_B2_D64_S2048_W2     0.08  True
 torch_eager              cuda_B2_D64_S2048_W4     0.08  True
 torch_eager              cuda_B2_D64_S512_W2     0.09  True
 torch_eager              cuda_B2_D64_S512_W4     0.08  True
@@ -4765,10 +4765,16 @@ torch_eager              cuda_B4_D2048_S512_W4     0.10  True
 torch_eager              cuda_B4_D64_S128_W2     0.08  True
 torch_eager              cuda_B4_D64_S128_W4     0.08  True
 torch_eager              cuda_B4_D64_S2048_W2     0.08  True
-torch_eager              cuda_B4_D64_S2048_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W4     0.09  True
 torch_eager              cuda_B4_D64_S512_W2     0.08  True
 torch_eager              cuda_B4_D64_S512_W4     0.08  True
 </pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 229ms
+</div>
+</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/causal_conv1d.jsonl" class="artifact" target="_blank">causal_conv1d.jsonl</a>
diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg
index e336babd9e22036016f034e9655aa303d520c536..07cfbdf7d6b5520fa7d67c8819a8378d9bcd8cb5 100644
--- a/causal_conv1d/results/artifacts/combine/latency.svg
+++ b/causal_conv1d/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a640783c4d5cb4dc1763b97fa9a3e0cf2d278599a3fc38ba2056846c760ec8fe
-size 35421
+oid sha256:3d92f3a3aa92e11f21958cf1c591a4e709fd40f7b0cccbd544c1e1a77b11bcd2
+size 35429
diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html
index dcc52b58db96b72ed197292d2ffb66bacd9bf72c..45b22fabef9b9c6a15964465834db2598fd9e481 100644
--- a/causal_conv1d/results/combined_results.html
+++ b/causal_conv1d/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:26.231666</dc:date>
+    <dc:date>2025-10-29T14:27:58.771179</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 375.159294  L 831.034248 375.159294  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 377.079386  L 831.034248 377.079386  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_25">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="375.159294" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="377.079386" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_25">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="378.958513" transform="rotate(-0 40.72 378.958513)">0.1</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="380.878605" transform="rotate(-0 40.72 380.878605)">0.1</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 292.369752  L 831.034248 292.369752  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 293.552318  L 831.034248 293.552318  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_26">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="292.369752" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="293.552318" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_26">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.168971" transform="rotate(-0 40.72 296.168971)">0.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="297.351537" transform="rotate(-0 40.72 297.351537)">0.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 209.58021  L 831.034248 209.58021  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 210.02525  L 831.034248 210.02525  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_27">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="209.58021" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="210.02525" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_27">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.379429" transform="rotate(-0 40.72 213.379429)">0.3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.824469" transform="rotate(-0 40.72 213.824469)">0.3</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 126.790668  L 831.034248 126.790668  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 126.498182  L 831.034248 126.498182  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_28">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="126.790668" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="126.498182" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_28">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.589887" transform="rotate(-0 40.72 130.589887)">0.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.297401" transform="rotate(-0 40.72 130.297401)">0.4</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 44.001126  L 831.034248 44.001126  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 42.971114  L 831.034248 42.971114  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_29">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="44.001126" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="42.971114" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_29">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.800344" transform="rotate(-0 40.72 47.800344)">0.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="46.770333" transform="rotate(-0 40.72 46.770333)">0.5</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4287,66 +4287,66 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     </g>
    </g>
    <g id="series--hf-kernels-causal-conv1d" class="series">
-    <path d="M 83.325193 420.186871  L 114.286231 412.917949  L 145.247268 412.868275  L 176.208306 413.042133  L 207.169343 414.110118  L 238.130381 414.110946  L 269.091418 413.580265  L 300.052455 414.938014  L 331.013493 414.656529  L 361.97453 415.161545  L 392.935568 415.575493  L 423.896605 414.035608  L 454.857643 415.195489  L 485.81868 415.20294  L 516.779718 414.706203  L 547.740755 414.043887  L 578.701793 412.479164  L 609.66283 413.795518  L 640.623868 413.141481  L 671.584905 413.489197  L 702.545943 414.151513  L 733.50698 413.886586  L 764.468018 414.582019  L 795.429055 415.368519  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.325193 420.186871  L 114.286231 413.746934  L 145.247268 413.019413  L 176.208306 414.649026  L 207.169343 414.665731  L 238.130381 415.985459  L 269.091418 416.252746  L 300.052455 415.525225  L 331.013493 416.703792  L 361.97453 415.692279  L 392.935568 416.269451  L 423.896605 416.168383  L 454.857643 415.52606  L 485.81868 415.952048  L 516.779718 414.072689  L 547.740755 415.399934  L 578.701793 415.43418  L 609.66283 416.402259  L 640.623868 414.841138  L 671.584905 415.024898  L 702.545943 414.990652  L 733.50698 414.974782  L 764.468018 415.61794  L 795.429055 415.759936  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#pb49fc4c8d2)">
      <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="114.286231" y="412.917949" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="145.247268" y="412.868275" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="176.208306" y="413.042133" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="207.169343" y="414.110118" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="238.130381" y="414.110946" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="269.091418" y="413.580265" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="300.052455" y="414.938014" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="331.013493" y="414.656529" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="361.97453" y="415.161545" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="392.935568" y="415.575493" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="423.896605" y="414.035608" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="454.857643" y="415.195489" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="485.81868" y="415.20294" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="516.779718" y="414.706203" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="547.740755" y="414.043887" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="578.701793" y="412.479164" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="609.66283" y="413.795518" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="640.623868" y="413.141481" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="671.584905" y="413.489197" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="702.545943" y="414.151513" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="733.50698" y="413.886586" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="764.468018" y="414.582019" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="795.429055" y="415.368519" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="114.286231" y="413.746934" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="145.247268" y="413.019413" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="176.208306" y="414.649026" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="207.169343" y="414.665731" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="238.130381" y="415.985459" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="269.091418" y="416.252746" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="300.052455" y="415.525225" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="331.013493" y="416.703792" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="361.97453" y="415.692279" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="392.935568" y="416.269451" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="423.896605" y="416.168383" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="454.857643" y="415.52606" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="485.81868" y="415.952048" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="516.779718" y="414.072689" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="547.740755" y="415.399934" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="578.701793" y="415.43418" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="609.66283" y="416.402259" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="640.623868" y="414.841138" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="671.584905" y="415.024898" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="702.545943" y="414.990652" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="733.50698" y="414.974782" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="764.468018" y="415.61794" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="795.429055" y="415.759936" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 83.325193 400.963139  L 114.286231 386.754798  L 145.247268 385.803546  L 176.208306 388.30379  L 207.169343 387.50901  L 238.130381 389.106849  L 269.091418 387.49328  L 300.052455 387.715984  L 331.013493 388.179606  L 361.97453 388.212722  L 392.935568 337.799686  L 423.896605 324.362943  L 454.857643 391.39184  L 485.81868 390.497713  L 516.779718 390.58133  L 547.740755 390.373529  L 578.701793 390.174834  L 609.66283 390.323855  L 640.623868 390.472876  L 671.584905 390.688129  L 702.545943 380.7865  L 733.50698 375.843964  L 764.468018 55.544472  L 795.429055 45.608899  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.325193 401.710683  L 114.286231 389.180788  L 145.247268 389.523249  L 176.208306 390.141349  L 207.169343 391.126968  L 238.130381 390.809566  L 269.091418 390.934856  L 300.052455 390.667569  L 331.013493 390.500515  L 361.97453 389.707008  L 392.935568 339.037818  L 423.896605 325.239147  L 454.857643 391.043441  L 485.81868 391.009195  L 516.779718 391.143674  L 547.740755 390.442046  L 578.701793 390.951562  L 609.66283 389.129836  L 640.623868 391.795185  L 671.584905 391.319081  L 702.545943 381.654999  L 733.50698 375.966806  L 764.468018 53.96077  L 795.429055 45.608899  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#pb49fc4c8d2)">
-     <use ns4:href="#m9b8c54d372" x="83.325193" y="400.963139" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="114.286231" y="386.754798" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="145.247268" y="385.803546" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="176.208306" y="388.30379" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="207.169343" y="387.50901" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="238.130381" y="389.106849" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="269.091418" y="387.49328" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="300.052455" y="387.715984" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="331.013493" y="388.179606" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="361.97453" y="388.212722" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="392.935568" y="337.799686" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="423.896605" y="324.362943" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="454.857643" y="391.39184" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="485.81868" y="390.497713" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="516.779718" y="390.58133" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="547.740755" y="390.373529" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="578.701793" y="390.174834" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="609.66283" y="390.323855" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="640.623868" y="390.472876" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="671.584905" y="390.688129" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="702.545943" y="380.7865" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="733.50698" y="375.843964" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="764.468018" y="55.544472" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.325193" y="401.710683" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="114.286231" y="389.180788" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="145.247268" y="389.523249" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="176.208306" y="390.141349" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="207.169343" y="391.126968" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="238.130381" y="390.809566" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="269.091418" y="390.934856" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="300.052455" y="390.667569" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="331.013493" y="390.500515" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="361.97453" y="389.707008" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="392.935568" y="339.037818" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="423.896605" y="325.239147" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="454.857643" y="391.043441" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="485.81868" y="391.009195" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="516.779718" y="391.143674" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="547.740755" y="390.442046" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="578.701793" y="390.951562" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="609.66283" y="389.129836" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="640.623868" y="391.795185" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="671.584905" y="391.319081" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="702.545943" y="381.654999" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="733.50698" y="375.966806" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="764.468018" y="53.96077" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
@@ -4405,7 +4405,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 4.38s
+Cell: combine | 4.32s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4499,11 +4499,11 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.06  True
 hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.05  True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.06  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.06  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2     0.05  True
@@ -4514,9 +4514,9 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S128_W4     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4     0.05  True
-hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.06  True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
-torch_eager              cuda_B2_D2048_S128_W2     0.09  True
+torch_eager              cuda_B2_D2048_S128_W2     0.08  True
 torch_eager              cuda_B2_D2048_S128_W4     0.08  True
 torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
 torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
@@ -4524,7 +4524,7 @@ torch_eager              cuda_B2_D2048_S512_W2     0.08  True
 torch_eager              cuda_B2_D2048_S512_W4     0.08  True
 torch_eager              cuda_B2_D64_S128_W2     0.07  True
 torch_eager              cuda_B2_D64_S128_W4     0.09  True
-torch_eager              cuda_B2_D64_S2048_W2     0.09  True
+torch_eager              cuda_B2_D64_S2048_W2     0.08  True
 torch_eager              cuda_B2_D64_S2048_W4     0.08  True
 torch_eager              cuda_B2_D64_S512_W2     0.09  True
 torch_eager              cuda_B2_D64_S512_W4     0.08  True
@@ -4537,7 +4537,7 @@ torch_eager              cuda_B4_D2048_S512_W4     0.10  True
 torch_eager              cuda_B4_D64_S128_W2     0.08  True
 torch_eager              cuda_B4_D64_S128_W4     0.08  True
 torch_eager              cuda_B4_D64_S2048_W2     0.08  True
-torch_eager              cuda_B4_D64_S2048_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W4     0.09  True
 torch_eager              cuda_B4_D64_S512_W2     0.08  True
 torch_eager              cuda_B4_D64_S512_W4     0.08  True
 
@@ -4559,7 +4559,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 221ms
+Installed 37 packages in 214ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4572,7 +4572,7 @@ Installed 37 packages in 221ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:26.231666</dc:date>
+    <dc:date>2025-10-29T14:27:58.771179</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4916,70 +4916,70 @@ Installed 37 packages in 221ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 375.159294  L 831.034248 375.159294  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 377.079386  L 831.034248 377.079386  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_25">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="375.159294" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="377.079386" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_25">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="378.958513" transform="rotate(-0 40.72 378.958513)">0.1</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="380.878605" transform="rotate(-0 40.72 380.878605)">0.1</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 292.369752  L 831.034248 292.369752  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 293.552318  L 831.034248 293.552318  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_26">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="292.369752" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="293.552318" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_26">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.168971" transform="rotate(-0 40.72 296.168971)">0.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="297.351537" transform="rotate(-0 40.72 297.351537)">0.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 209.58021  L 831.034248 209.58021  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 210.02525  L 831.034248 210.02525  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_27">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="209.58021" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="210.02525" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_27">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.379429" transform="rotate(-0 40.72 213.379429)">0.3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.824469" transform="rotate(-0 40.72 213.824469)">0.3</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 126.790668  L 831.034248 126.790668  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 126.498182  L 831.034248 126.498182  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_28">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="126.790668" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="126.498182" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_28">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.589887" transform="rotate(-0 40.72 130.589887)">0.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.297401" transform="rotate(-0 40.72 130.297401)">0.4</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 44.001126  L 831.034248 44.001126  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 42.971114  L 831.034248 42.971114  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_29">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="44.001126" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="42.971114" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_29">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.800344" transform="rotate(-0 40.72 47.800344)">0.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="46.770333" transform="rotate(-0 40.72 46.770333)">0.5</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4987,66 +4987,66 @@ Installed 37 packages in 221ms
     </g>
    </g>
    <g id="series--hf-kernels-causal-conv1d" class="series">
-    <path d="M 83.325193 420.186871  L 114.286231 412.917949  L 145.247268 412.868275  L 176.208306 413.042133  L 207.169343 414.110118  L 238.130381 414.110946  L 269.091418 413.580265  L 300.052455 414.938014  L 331.013493 414.656529  L 361.97453 415.161545  L 392.935568 415.575493  L 423.896605 414.035608  L 454.857643 415.195489  L 485.81868 415.20294  L 516.779718 414.706203  L 547.740755 414.043887  L 578.701793 412.479164  L 609.66283 413.795518  L 640.623868 413.141481  L 671.584905 413.489197  L 702.545943 414.151513  L 733.50698 413.886586  L 764.468018 414.582019  L 795.429055 415.368519  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.325193 420.186871  L 114.286231 413.746934  L 145.247268 413.019413  L 176.208306 414.649026  L 207.169343 414.665731  L 238.130381 415.985459  L 269.091418 416.252746  L 300.052455 415.525225  L 331.013493 416.703792  L 361.97453 415.692279  L 392.935568 416.269451  L 423.896605 416.168383  L 454.857643 415.52606  L 485.81868 415.952048  L 516.779718 414.072689  L 547.740755 415.399934  L 578.701793 415.43418  L 609.66283 416.402259  L 640.623868 414.841138  L 671.584905 415.024898  L 702.545943 414.990652  L 733.50698 414.974782  L 764.468018 415.61794  L 795.429055 415.759936  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#pb49fc4c8d2)">
      <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="114.286231" y="412.917949" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="145.247268" y="412.868275" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="176.208306" y="413.042133" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="207.169343" y="414.110118" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="238.130381" y="414.110946" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="269.091418" y="413.580265" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="300.052455" y="414.938014" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="331.013493" y="414.656529" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="361.97453" y="415.161545" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="392.935568" y="415.575493" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="423.896605" y="414.035608" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="454.857643" y="415.195489" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="485.81868" y="415.20294" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="516.779718" y="414.706203" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="547.740755" y="414.043887" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="578.701793" y="412.479164" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="609.66283" y="413.795518" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="640.623868" y="413.141481" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="671.584905" y="413.489197" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="702.545943" y="414.151513" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="733.50698" y="413.886586" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="764.468018" y="414.582019" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="795.429055" y="415.368519" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="114.286231" y="413.746934" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="145.247268" y="413.019413" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="176.208306" y="414.649026" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="207.169343" y="414.665731" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="238.130381" y="415.985459" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="269.091418" y="416.252746" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="300.052455" y="415.525225" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="331.013493" y="416.703792" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="361.97453" y="415.692279" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="392.935568" y="416.269451" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="423.896605" y="416.168383" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="454.857643" y="415.52606" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="485.81868" y="415.952048" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="516.779718" y="414.072689" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="547.740755" y="415.399934" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="578.701793" y="415.43418" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="609.66283" y="416.402259" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="640.623868" y="414.841138" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="671.584905" y="415.024898" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="702.545943" y="414.990652" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="733.50698" y="414.974782" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="764.468018" y="415.61794" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="795.429055" y="415.759936" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 83.325193 400.963139  L 114.286231 386.754798  L 145.247268 385.803546  L 176.208306 388.30379  L 207.169343 387.50901  L 238.130381 389.106849  L 269.091418 387.49328  L 300.052455 387.715984  L 331.013493 388.179606  L 361.97453 388.212722  L 392.935568 337.799686  L 423.896605 324.362943  L 454.857643 391.39184  L 485.81868 390.497713  L 516.779718 390.58133  L 547.740755 390.373529  L 578.701793 390.174834  L 609.66283 390.323855  L 640.623868 390.472876  L 671.584905 390.688129  L 702.545943 380.7865  L 733.50698 375.843964  L 764.468018 55.544472  L 795.429055 45.608899  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.325193 401.710683  L 114.286231 389.180788  L 145.247268 389.523249  L 176.208306 390.141349  L 207.169343 391.126968  L 238.130381 390.809566  L 269.091418 390.934856  L 300.052455 390.667569  L 331.013493 390.500515  L 361.97453 389.707008  L 392.935568 339.037818  L 423.896605 325.239147  L 454.857643 391.043441  L 485.81868 391.009195  L 516.779718 391.143674  L 547.740755 390.442046  L 578.701793 390.951562  L 609.66283 389.129836  L 640.623868 391.795185  L 671.584905 391.319081  L 702.545943 381.654999  L 733.50698 375.966806  L 764.468018 53.96077  L 795.429055 45.608899  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#pb49fc4c8d2)">
-     <use ns4:href="#m9b8c54d372" x="83.325193" y="400.963139" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="114.286231" y="386.754798" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="145.247268" y="385.803546" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="176.208306" y="388.30379" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="207.169343" y="387.50901" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="238.130381" y="389.106849" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="269.091418" y="387.49328" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="300.052455" y="387.715984" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="331.013493" y="388.179606" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="361.97453" y="388.212722" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="392.935568" y="337.799686" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="423.896605" y="324.362943" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="454.857643" y="391.39184" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="485.81868" y="390.497713" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="516.779718" y="390.58133" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="547.740755" y="390.373529" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="578.701793" y="390.174834" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="609.66283" y="390.323855" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="640.623868" y="390.472876" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="671.584905" y="390.688129" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="702.545943" y="380.7865" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="733.50698" y="375.843964" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="764.468018" y="55.544472" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.325193" y="401.710683" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="114.286231" y="389.180788" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="145.247268" y="389.523249" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="176.208306" y="390.141349" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="207.169343" y="391.126968" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="238.130381" y="390.809566" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="269.091418" y="390.934856" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="300.052455" y="390.667569" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="331.013493" y="390.500515" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="361.97453" y="389.707008" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="392.935568" y="339.037818" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="423.896605" y="325.239147" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="454.857643" y="391.043441" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="485.81868" y="391.009195" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="516.779718" y="391.143674" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="547.740755" y="390.442046" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="578.701793" y="390.951562" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="609.66283" y="389.129836" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="640.623868" y="391.795185" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="671.584905" y="391.319081" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="702.545943" y="381.654999" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="733.50698" y="375.966806" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="764.468018" y="53.96077" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl
index c187be19ed971576faca83871bac5aeb9c24284a..dfaf0c99c533e861b9b0cf0a7d640e38745db1c9 100644
--- a/flash_attn/impls/artifacts/benchmark/attention.jsonl
+++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl
@@ -1,6 +1,6 @@
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9106109999947876, "p50": 0.9171110000352201, "p90": 0.9204320000435473, "mean": 0.9179216000347878, "iqr": 0.005419999979494605, "raw_times": [0.9171110000352201, 0.9150120000640527, 0.9106109999947876, 0.9204320000435473, 0.9264420000363316], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9176309999929799, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9576329999845257, "p50": 0.960063999968952, "p90": 0.9623629999850891, "mean": 0.9611931999643275, "iqr": 0.0033900000744324643, "raw_times": [0.9589729999106567, 0.9576329999845257, 0.960063999968952, 0.9669329999724141, 0.9623629999850891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9673530000782193, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0019650000003821, "p50": 1.0193159999971613, "p90": 1.0211459999709405, "mean": 1.015251600006195, "iqr": 0.01198099994326185, "raw_times": [1.0019650000003821, 1.0091650000276786, 1.024666000034813, 1.0193159999971613, 1.0211459999709405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.009233999980097, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0176959999625979, "p50": 1.0199849999708022, "p90": 1.025695000066662, "mean": 1.0218714000075124, "iqr": 0.006820000066909415, "raw_times": [1.0271060000377474, 1.0176959999625979, 1.0188749999997526, 1.0199849999708022, 1.025695000066662], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.027405000058934, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1665810000067722, "p50": 1.1845809999613266, "p90": 1.185440999961429, "mean": 1.1787729999923613, "iqr": 0.01419000000169035, "raw_times": [1.1712509999597387, 1.1665810000067722, 1.18601100007254, 1.1845809999613266, 1.185440999961429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1787800000320203, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1722899999995207, "p50": 1.1832310000272628, "p90": 1.1854509999693619, "mean": 1.181276799979969, "iqr": 0.008630000024822948, "raw_times": [1.1885909999591604, 1.1854509999693619, 1.176820999944539, 1.1832310000272628, 1.1722899999995207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1782799999764393, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:40Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.98791400000664, "p50": 0.995113999977093, "p90": 1.0003840000081254, "mean": 0.9967803999984426, "iqr": 0.00634899998885885, "raw_times": [0.98791400000664, 0.9940350000192666, 1.006454999981088, 1.0003840000081254, 0.995113999977093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0150049999992916, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0233649999804584, "p50": 1.0321449999537435, "p90": 1.0348449999924014, "mean": 1.032277399974646, "iqr": 0.009739000006447895, "raw_times": [1.045925999960673, 1.0251059999859535, 1.0321449999537435, 1.0233649999804584, 1.0348449999924014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.035865999995167, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0737370000128976, "p50": 1.084086999981082, "p90": 1.088675999994848, "mean": 1.0826705999988917, "iqr": 0.0148400000057336, "raw_times": [1.0738359999891145, 1.0930170000165162, 1.084086999981082, 1.0737370000128976, 1.088675999994848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0889670000437945, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0887770000067576, "p50": 1.0916359999555425, "p90": 1.096396999969329, "mean": 1.0932085999797891, "iqr": 0.005600999998023326, "raw_times": [1.0887770000067576, 1.0907959999713057, 1.0916359999555425, 1.0984369999960109, 1.096396999969329], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1031370000296192, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2465009999687027, "p50": 1.2523310000460697, "p90": 1.2523909999799798, "mean": 1.2538410000047406, "iqr": 0.005290999979479238, "raw_times": [1.2523310000460697, 1.2523909999799798, 1.2471000000005006, 1.2465009999687027, 1.2708820000284504], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2551809999763464, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2355700000057368, "p50": 1.241141000036805, "p90": 1.2576200000466997, "mean": 1.2477664000130062, "iqr": 0.02047000003813082, "raw_times": [1.2355700000057368, 1.241141000036805, 1.237150000008569, 1.2576200000466997, 1.2673509999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2579809999806457, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py
index 15f02e2ed444e10eba9708f3f69247414b6c962b..04ae262009c3d6e33aaa3e392d28c903f24c287c 100644
--- a/flash_attn/impls/cells/benchmark.py
+++ b/flash_attn/impls/cells/benchmark.py
@@ -4,7 +4,7 @@
 #     "numpy",
 #     "torch==2.8.0",
 #     "kernels-benchmark-tools",
-#     "kernels",
+#     "xformers",
 # ]
 #
 # [tool.uv.sources]
@@ -13,19 +13,18 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
+import xformers.ops as xops
 
-# Load the flash attention 3 kernel
-hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
 
-
-def hf_flash_attention3(query, key, value):
-    return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+def xformers_attention(q, k, v):
+    """xFormers memory efficient attention"""
+    # xFormers expects [batch, seq_len, heads, head_dim]
+    return xops.memory_efficient_attention(q, k, v)
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.ATTENTION,
-    impl_name="hf_kernels_flash_attn3",
-    impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
-    impl_func=hf_flash_attention3,
+    impl_name="xformers_meff",
+    impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+    impl_func=xformers_attention,
 )
\ No newline at end of file
diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html
index b4834aa56614f91a384d067a2ab29e14d8abc5f4..a6e50f4eba46389d1f17c35d67cbb770dc3d8952 100644
--- a/flash_attn/impls/flash_attention.html
+++ b/flash_attn/impls/flash_attention.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.26s
+Cell: nv | 0.28s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3888,7 +3888,7 @@ Cell: nv | 0.26s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:39 2025       
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:25:53 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3897,7 +3897,7 @@ Cell: nv | 0.26s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0            153W /  350W |       0MiB /  46068MiB |     26%      Default |
+| N/A   27C    P8             21W /  350W |       0MiB /  46068MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3919,9 +3919,9 @@ Cell: nv | 0.26s
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 3.83s
+Cell: benchmark | 32.77s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.585ms       101.47%       3.585ms       3.585ms             1  
-                                         torch_flash_ma         6.34%     327.656us        45.53%       2.352ms       2.352ms       0.000us         0.00%       3.573ms       3.573ms             1  
-                     aten::scaled_dot_product_attention         0.82%      42.312us         4.12%     213.057us      71.019us       0.000us         0.00%       2.820ms     940.062us             3  
-              aten::_scaled_dot_product_flash_attention         0.51%      26.321us         3.31%     170.745us      56.915us       0.000us         0.00%       2.820ms     940.062us             3  
-                         aten::_flash_attention_forward         0.73%      37.527us         2.40%     124.015us      41.338us       2.820ms        79.83%       2.820ms     940.062us             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.820ms        79.83%       2.820ms     940.062us             3  
-                                       aten::contiguous         0.27%      14.121us        33.79%       1.745ms     145.446us       0.000us         0.00%     752.928us      62.744us            12  
-                                            aten::clone         0.72%      37.329us        33.52%       1.731ms     144.269us       0.000us         0.00%     752.928us      62.744us            12  
-                                            aten::copy_         1.68%      87.013us        31.25%       1.614ms     134.513us     712.672us        20.17%     752.928us      62.744us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     712.672us        20.17%     712.672us      59.389us            12  
-                                Activity Buffer Request        27.64%       1.428ms        27.64%       1.428ms       1.428ms      40.256us         1.14%      40.256us      40.256us             1  
-                                        aten::transpose         1.24%      64.087us         1.67%      86.009us       3.584us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.42%      21.922us         0.42%      21.922us       0.913us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.48%      24.711us         1.99%     102.775us       6.852us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.74%      89.843us         1.74%      89.843us       3.743us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.38%     122.771us         2.38%     122.771us       8.185us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.34%      17.310us         0.34%      17.310us       5.770us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       2.229us         0.04%       2.229us       0.372us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.17%       8.900us         0.17%       8.900us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        54.47%       2.814ms        54.47%       2.814ms       2.814ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.644ms       102.02%       3.644ms       3.644ms             1  
+                                         torch_flash_ma         6.80%     356.846us        47.04%       2.468ms       2.468ms       0.000us         0.00%       3.612ms       3.612ms             1  
+                     aten::scaled_dot_product_attention         0.82%      43.042us         4.47%     234.776us      78.259us       0.000us         0.00%       2.857ms     952.201us             3  
+              aten::_scaled_dot_product_flash_attention         0.56%      29.330us         3.65%     191.734us      63.911us       0.000us         0.00%       2.857ms     952.201us             3  
+                         aten::_flash_attention_forward         0.75%      39.581us         2.59%     135.674us      45.225us       2.857ms        79.97%       2.857ms     952.201us             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.857ms        79.97%       2.857ms     952.201us             3  
+                                       aten::contiguous         0.27%      14.180us        34.32%       1.801ms     150.051us       0.000us         0.00%     755.680us      62.973us            12  
+                                            aten::clone         0.74%      38.791us        34.04%       1.786ms     148.870us       0.000us         0.00%     755.680us      62.973us            12  
+                                            aten::copy_         1.85%      97.030us        31.43%       1.649ms     137.429us     715.456us        20.03%     755.680us      62.973us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     715.456us        20.03%     715.456us      59.621us            12  
+                                Activity Buffer Request        27.38%       1.437ms        27.38%       1.437ms       1.437ms      40.224us         1.13%      40.224us      40.224us             1  
+                                        aten::transpose         1.47%      77.273us         1.96%     102.714us       4.280us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.48%      25.441us         0.48%      25.441us       1.060us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.70%      36.821us         2.35%     123.326us       8.222us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.93%     101.493us         1.93%     101.493us       4.229us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.70%     141.775us         2.70%     141.775us       9.452us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.35%      18.402us         0.35%      18.402us       6.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.540us         0.05%       2.540us       0.423us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.17%       8.890us         0.17%       8.890us       2.963us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        52.96%       2.779ms        52.96%       2.779ms       2.779ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.165ms
-Self CUDA time total: 3.533ms
+Self CPU time total: 5.247ms
+Self CUDA time total: 3.572ms
 
 
 
@@ -4004,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.84%     255.079us        41.49%       2.188ms       2.188ms       0.000us         0.00%       3.787ms       3.787ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.743ms       100.29%       3.743ms       3.743ms             1  
-                     aten::scaled_dot_product_attention         0.47%      24.640us         3.42%     180.356us      60.119us       0.000us         0.00%       2.967ms     989.106us             3  
-              aten::_scaled_dot_product_flash_attention         0.36%      19.241us         2.95%     155.716us      51.905us       0.000us         0.00%       2.967ms     989.106us             3  
-                         aten::_flash_attention_forward         0.73%      38.683us         2.19%     115.525us      38.508us       2.967ms        79.51%       2.967ms     989.106us             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.967ms        79.51%       2.967ms     989.106us             3  
-                                       aten::contiguous         0.17%       8.802us        32.41%       1.709ms     142.425us       0.000us         0.00%     819.868us      68.322us            12  
-                                            aten::clone         0.52%      27.349us        32.24%       1.700ms     141.692us       0.000us         0.00%     819.868us      68.322us            12  
-                                            aten::copy_         1.56%      82.061us        30.60%       1.614ms     134.473us     764.892us        20.49%     819.868us      68.322us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     764.892us        20.49%     764.892us      63.741us            12  
-                                Activity Buffer Request        27.50%       1.450ms        27.50%       1.450ms       1.450ms      54.976us         1.47%      54.976us      54.976us             1  
-                                        aten::transpose         0.91%      47.959us         1.22%      64.512us       2.688us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.31%      16.553us         0.31%      16.553us       0.690us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.39%      20.732us         1.52%      80.304us       5.354us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.38%      72.972us         1.38%      72.972us       3.040us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.96%     103.146us         1.96%     103.146us       6.876us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.28%      14.880us         0.28%      14.880us       4.960us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.800us         0.03%       1.800us       0.300us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.830us         0.07%       3.830us       1.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.51%       3.085ms        58.51%       3.085ms       3.085ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.70%     246.528us        41.73%       2.189ms       2.189ms       0.000us         0.00%       3.817ms       3.817ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.772ms       100.28%       3.772ms       3.772ms             1  
+                     aten::scaled_dot_product_attention         0.51%      26.610us         3.43%     180.143us      60.048us       0.000us         0.00%       2.999ms     999.573us             3  
+              aten::_scaled_dot_product_flash_attention         0.37%      19.600us         2.93%     153.533us      51.178us       0.000us         0.00%       2.999ms     999.573us             3  
+                         aten::_flash_attention_forward         0.63%      32.980us         2.12%     111.443us      37.148us       2.999ms        79.71%       2.999ms     999.573us             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.999ms        79.71%       2.999ms     999.573us             3  
+                                       aten::contiguous         0.19%      10.030us        32.68%       1.715ms     142.893us       0.000us         0.00%     818.210us      68.184us            12  
+                                            aten::clone         0.55%      29.002us        32.49%       1.705ms     142.057us       0.000us         0.00%     818.210us      68.184us            12  
+                                            aten::copy_         2.09%     109.441us        30.74%       1.613ms     134.399us     763.297us        20.29%     818.210us      68.184us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     763.297us        20.29%     763.297us      63.608us            12  
+                                Activity Buffer Request        26.94%       1.413ms        26.94%       1.413ms       1.413ms      54.913us         1.46%      54.913us      54.913us             1  
+                                        aten::transpose         1.00%      52.652us         1.34%      70.433us       2.935us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.34%      17.781us         0.34%      17.781us       0.741us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.38%      19.980us         1.61%      84.581us       5.639us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.45%      76.201us         1.45%      76.201us       3.175us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.16%     113.102us         2.16%     113.102us       7.540us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.31%      16.430us         0.31%      16.430us       5.477us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.751us         0.03%       1.751us       0.292us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.771us         0.07%       3.771us       1.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.27%       3.058ms        58.27%       3.058ms       3.058ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.273ms
-Self CUDA time total: 3.732ms
+Self CPU time total: 5.247ms
+Self CUDA time total: 3.762ms
 
 
 
@@ -4036,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.77%     251.162us        41.45%       2.184ms       2.184ms       0.000us         0.00%       3.786ms       3.786ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.738ms       100.28%       3.738ms       3.738ms             1  
-                     aten::scaled_dot_product_attention         0.46%      24.280us         3.42%     180.086us      60.029us       0.000us         0.00%       2.949ms     982.872us             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      18.160us         2.96%     155.806us      51.935us       0.000us         0.00%       2.949ms     982.872us             3  
-                         aten::_flash_attention_forward         0.73%      38.599us         2.20%     115.865us      38.622us       2.949ms        79.09%       2.949ms     982.872us             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.949ms        79.09%       2.949ms     982.872us             3  
-                                       aten::contiguous         0.17%       8.991us        32.44%       1.710ms     142.465us       0.000us         0.00%     837.719us      69.810us            12  
-                                            aten::clone         0.53%      27.728us        32.27%       1.701ms     141.715us       0.000us         0.00%     837.719us      69.810us            12  
-                                            aten::copy_         1.52%      79.873us        30.57%       1.611ms     134.242us     779.480us        20.91%     837.719us      69.810us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     779.480us        20.91%     779.480us      64.957us            12  
-                                Activity Buffer Request        27.50%       1.449ms        27.50%       1.449ms       1.449ms      58.239us         1.56%      58.239us      58.239us             1  
-                                        aten::transpose         0.92%      48.219us         1.24%      65.252us       2.719us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.32%      17.033us         0.32%      17.033us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.37%      19.303us         1.55%      81.795us       5.453us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.44%      76.031us         1.44%      76.031us       3.168us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.98%     104.564us         1.98%     104.564us       6.971us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.28%      14.492us         0.28%      14.492us       4.831us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       1.860us         0.04%       1.860us       0.310us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.10%       5.030us         0.10%       5.030us       1.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.55%       3.085ms        58.55%       3.085ms       3.085ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.50%     237.986us        41.18%       2.178ms       2.178ms       0.000us         0.00%       3.833ms       3.833ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.785ms       100.29%       3.785ms       3.785ms             1  
+                     aten::scaled_dot_product_attention         0.46%      24.381us         3.40%     179.915us      59.972us       0.000us         0.00%       2.998ms     999.221us             3  
+              aten::_scaled_dot_product_flash_attention         0.36%      19.171us         2.94%     155.534us      51.845us       0.000us         0.00%       2.998ms     999.221us             3  
+                         aten::_flash_attention_forward         0.65%      34.259us         2.15%     113.691us      37.897us       2.998ms        79.44%       2.998ms     999.221us             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.998ms        79.44%       2.998ms     999.221us             3  
+                                       aten::contiguous         0.19%       9.800us        32.38%       1.712ms     142.708us       0.000us         0.00%     835.263us      69.605us            12  
+                                            aten::clone         0.53%      28.211us        32.20%       1.703ms     141.891us       0.000us         0.00%     835.263us      69.605us            12  
+                                            aten::copy_         1.60%      84.650us        30.46%       1.611ms     134.247us     776.063us        20.56%     835.263us      69.605us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     776.063us        20.56%     776.063us      64.672us            12  
+                                Activity Buffer Request        27.18%       1.437ms        27.18%       1.437ms       1.437ms      59.200us         1.57%      59.200us      59.200us             1  
+                                        aten::transpose         0.99%      52.225us         1.33%      70.125us       2.922us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.34%      17.900us         0.34%      17.900us       0.746us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.37%      19.782us         1.60%      84.803us       5.654us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.45%      76.431us         1.45%      76.431us       3.185us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.16%     114.204us         2.16%     114.204us       7.614us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.30%      16.100us         0.30%      16.100us       5.367us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.730us         0.03%       1.730us       0.288us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.730us         0.07%       3.730us       1.243us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.82%       3.110ms        58.82%       3.110ms       3.110ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.269ms
-Self CUDA time total: 3.728ms
+Self CPU time total: 5.288ms
+Self CUDA time total: 3.774ms
 
 
 
@@ -4068,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         5.01%     280.573us        44.17%       2.475ms       2.475ms       0.000us         0.00%       3.878ms       3.878ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.831ms       100.27%       3.831ms       3.831ms             1  
-                     aten::scaled_dot_product_attention         0.48%      26.630us         3.39%     189.956us      63.319us       0.000us         0.00%       3.032ms       1.011ms             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      19.101us         2.91%     163.326us      54.442us       0.000us         0.00%       3.032ms       1.011ms             3  
-                         aten::_flash_attention_forward         0.70%      39.063us         2.15%     120.325us      40.108us       3.032ms        79.37%       3.032ms       1.011ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.032ms        79.37%       3.032ms       1.011ms             3  
-                                       aten::contiguous         0.17%       9.271us        34.98%       1.960ms     163.354us       0.000us         0.00%     845.820us      70.485us            12  
-                                            aten::clone         0.52%      28.974us        34.82%       1.951ms     162.581us       0.000us         0.00%     845.820us      70.485us            12  
-                                            aten::copy_         1.48%      83.180us        33.17%       1.859ms     154.908us     788.284us        20.63%     845.820us      70.485us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     788.284us        20.63%     788.284us      65.690us            12  
-                                Activity Buffer Request        26.18%       1.467ms        26.18%       1.467ms       1.467ms      57.536us         1.51%      57.536us      57.536us             1  
-                                        aten::transpose         0.89%      50.110us         1.21%      67.952us       2.831us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.32%      17.842us         0.32%      17.842us       0.743us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.36%      19.969us         1.53%      85.492us       5.699us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.37%      76.982us         1.37%      76.982us       3.208us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         5.95%     333.480us         5.95%     333.480us      22.232us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.30%      17.041us         0.30%      17.041us       5.680us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.700us         0.03%       1.700us       0.283us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.040us         0.07%       4.040us       1.347us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        55.83%       3.129ms        55.83%       3.129ms       3.129ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.36%     241.837us        43.33%       2.405ms       2.405ms       0.000us         0.00%       3.884ms       3.884ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.837ms       100.27%       3.837ms       3.837ms             1  
+                     aten::scaled_dot_product_attention         0.48%      26.802us         3.27%     181.715us      60.572us       0.000us         0.00%       3.042ms       1.014ms             3  
+              aten::_scaled_dot_product_flash_attention         0.35%      19.308us         2.79%     154.913us      51.638us       0.000us         0.00%       3.042ms       1.014ms             3  
+                         aten::_flash_attention_forward         0.60%      33.361us         2.03%     112.712us      37.571us       3.042ms        79.50%       3.042ms       1.014ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.042ms        79.50%       3.042ms       1.014ms             3  
+                                       aten::contiguous         0.17%       9.659us        34.84%       1.934ms     161.162us       0.000us         0.00%     841.829us      70.152us            12  
+                                            aten::clone         0.50%      27.830us        34.67%       1.924ms     160.357us       0.000us         0.00%     841.829us      70.152us            12  
+                                            aten::copy_         1.56%      86.702us        32.55%       1.807ms     150.547us     784.548us        20.50%     841.829us      70.152us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     784.548us        20.50%     784.548us      65.379us            12  
+                                Activity Buffer Request        25.45%       1.413ms        25.45%       1.413ms       1.413ms      57.281us         1.50%      57.281us      57.281us             1  
+                                        aten::transpose         0.95%      52.620us         1.27%      70.404us       2.933us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.32%      17.784us         0.32%      17.784us       0.741us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.78%      43.221us         2.00%     111.194us       7.413us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.45%      80.673us         1.45%      80.673us       3.361us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.96%     331.078us         5.96%     331.078us      22.072us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.28%      15.800us         0.28%      15.800us       5.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.730us         0.03%       1.730us       0.288us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.850us         0.07%       3.850us       1.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        56.67%       3.146ms        56.67%       3.146ms       3.146ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.603ms
-Self CUDA time total: 3.820ms
+Self CPU time total: 5.551ms
+Self CUDA time total: 3.827ms
 
 
 
@@ -4100,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         5.07%     303.893us        39.93%       2.395ms       2.395ms       0.000us         0.00%       4.370ms       4.370ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.320ms       100.25%       4.320ms       4.320ms             1  
-                     aten::scaled_dot_product_attention         0.41%      24.650us         3.07%     184.006us      61.335us       0.000us         0.00%       3.503ms       1.168ms             3  
-              aten::_scaled_dot_product_flash_attention         0.32%      19.311us         2.66%     159.356us      53.119us       0.000us         0.00%       3.503ms       1.168ms             3  
-                         aten::_flash_attention_forward         0.68%      40.911us         1.97%     118.205us      39.402us       3.503ms        81.28%       3.503ms       1.168ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.503ms        81.28%       3.503ms       1.168ms             3  
-                                       aten::contiguous         0.15%       8.977us        31.04%       1.862ms     155.201us       0.000us         0.00%     867.581us      72.298us            12  
-                                            aten::clone         0.47%      28.114us        30.89%       1.853ms     154.453us       0.000us         0.00%     867.581us      72.298us            12  
-                                            aten::copy_         1.36%      81.500us        29.40%       1.764ms     146.991us     806.749us        18.72%     867.581us      72.298us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     806.749us        18.72%     806.749us      67.229us            12  
-                                Activity Buffer Request        23.82%       1.429ms        23.82%       1.429ms       1.429ms      60.832us         1.41%      60.832us      60.832us             1  
-                                        aten::transpose         0.82%      49.363us         1.11%      66.863us       2.786us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.29%      17.500us         0.29%      17.500us       0.729us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.33%      20.081us         1.37%      82.424us       5.495us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.26%      75.593us         1.26%      75.593us       3.150us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         4.60%     275.759us         4.60%     275.759us      18.384us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.25%      15.251us         0.25%      15.251us       5.084us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.740us         0.03%       1.740us       0.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.06%       3.680us         0.06%       3.680us       1.227us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        60.07%       3.604ms        60.07%       3.604ms       3.604ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.46%     268.165us        40.09%       2.413ms       2.413ms       0.000us         0.00%       4.405ms       4.405ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.355ms       100.25%       4.355ms       4.355ms             1  
+                     aten::scaled_dot_product_attention         0.46%      27.642us         3.64%     218.806us      72.935us       0.000us         0.00%       3.540ms       1.180ms             3  
+              aten::_scaled_dot_product_flash_attention         0.75%      45.250us         3.18%     191.164us      63.721us       0.000us         0.00%       3.540ms       1.180ms             3  
+                         aten::_flash_attention_forward         0.61%      36.651us         2.01%     120.923us      40.308us       3.540ms        81.48%       3.540ms       1.180ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.540ms        81.48%       3.540ms       1.180ms             3  
+                                       aten::contiguous         0.18%      10.862us        31.11%       1.873ms     156.050us       0.000us         0.00%     865.606us      72.134us            12  
+                                            aten::clone         0.51%      30.490us        30.93%       1.862ms     155.145us       0.000us         0.00%     865.606us      72.134us            12  
+                                            aten::copy_         1.51%      90.931us        29.34%       1.766ms     147.155us     804.645us        18.52%     865.606us      72.134us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     804.645us        18.52%     804.645us      67.054us            12  
+                                Activity Buffer Request        21.61%       1.300ms        21.61%       1.300ms       1.300ms      60.961us         1.40%      60.961us      60.961us             1  
+                                        aten::transpose         0.99%      59.753us         1.30%      78.501us       3.271us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.31%      18.748us         0.31%      18.748us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.35%      20.935us         1.45%      87.165us       5.811us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.32%      79.690us         1.32%      79.690us       3.320us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.67%     401.680us         6.67%     401.680us      26.779us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.27%      16.081us         0.27%      16.081us       5.360us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       2.030us         0.03%       2.030us       0.338us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.810us         0.06%       3.810us       1.270us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.91%       3.605ms        59.91%       3.605ms       3.605ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.999ms
-Self CUDA time total: 4.309ms
+Self CPU time total: 6.018ms
+Self CUDA time total: 4.344ms
 
 
 
@@ -4132,39 +4132,91 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         3.83%     232.270us        37.82%       2.296ms       2.296ms       0.000us         0.00%       4.474ms       4.474ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.423ms       100.25%       4.423ms       4.423ms             1  
-                     aten::scaled_dot_product_attention         0.41%      24.850us         2.85%     172.746us      57.582us       0.000us         0.00%       3.595ms       1.198ms             3  
-              aten::_scaled_dot_product_flash_attention         0.30%      18.250us         2.44%     147.896us      49.299us       0.000us         0.00%       3.595ms       1.198ms             3  
-                         aten::_flash_attention_forward         0.54%      32.692us         1.77%     107.224us      35.741us       3.595ms        81.48%       3.595ms       1.198ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.595ms        81.48%       3.595ms       1.198ms             3  
-                                       aten::contiguous         0.14%       8.610us        30.41%       1.846ms     153.859us       0.000us         0.00%     878.139us      73.178us            12  
-                                            aten::clone         0.45%      27.368us        30.27%       1.838ms     153.142us       0.000us         0.00%     878.139us      73.178us            12  
-                                            aten::copy_         1.35%      81.917us        28.83%       1.750ms     145.831us     817.083us        18.52%     878.139us      73.178us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     817.083us        18.52%     817.083us      68.090us            12  
-                                Activity Buffer Request        23.72%       1.440ms        23.72%       1.440ms       1.440ms      61.056us         1.38%      61.056us      61.056us             1  
-                                        aten::transpose         0.82%      50.064us         1.10%      66.792us       2.783us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.28%      16.728us         0.28%      16.728us       0.697us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.32%      19.431us         1.31%      79.591us       5.306us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.21%      73.220us         1.21%      73.220us       3.051us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         4.12%     249.950us         4.12%     249.950us      16.663us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.24%      14.270us         0.24%      14.270us       4.757us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.680us         0.03%       1.680us       0.280us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.380us         0.07%       4.380us       1.460us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.18%       3.775ms        62.18%       3.775ms       3.775ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.01%     246.839us        39.75%       2.447ms       2.447ms       0.000us         0.00%       4.458ms       4.458ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.407ms       100.23%       4.407ms       4.407ms             1  
+                     aten::scaled_dot_product_attention         0.40%      24.621us         2.95%     181.474us      60.491us       0.000us         0.00%       3.579ms       1.193ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      20.980us         2.55%     156.853us      52.284us       0.000us         0.00%       3.579ms       1.193ms             3  
+                         aten::_flash_attention_forward         0.58%      35.588us         1.84%     113.003us      37.668us       3.579ms        81.40%       3.579ms       1.193ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.579ms        81.40%       3.579ms       1.193ms             3  
+                                       aten::contiguous         0.16%      10.061us        32.01%       1.971ms     164.244us       0.000us         0.00%     878.818us      73.235us            12  
+                                            aten::clone         0.50%      30.903us        31.85%       1.961ms     163.406us       0.000us         0.00%     878.818us      73.235us            12  
+                                            aten::copy_         1.35%      82.841us        30.27%       1.864ms     155.305us     817.634us        18.60%     878.818us      73.235us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     817.634us        18.60%     817.634us      68.136us            12  
+                                Activity Buffer Request        23.50%       1.447ms        23.50%       1.447ms       1.447ms      61.184us         1.39%      61.184us      61.184us             1  
+                                        aten::transpose         0.85%      52.630us         1.15%      70.790us       2.950us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.29%      18.160us         0.29%      18.160us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.33%      20.456us         1.41%      86.700us       5.780us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.28%      78.794us         1.28%      78.794us       3.283us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.81%     357.919us         5.81%     357.919us      23.861us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.25%      15.401us         0.25%      15.401us       5.134us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.632us         0.03%       1.632us       0.272us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.720us         0.06%       3.720us       1.240us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        60.25%       3.709ms        60.25%       3.709ms       3.709ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.071ms
-Self CUDA time total: 4.413ms
+Self CPU time total: 6.156ms
+Self CUDA time total: 4.397ms
 
 
 impl                     wl                  p50(ms)  ok
 torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
-torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
-torch_flash_ma           cuda_attn_L320_bfloat16     1.28  True
-torch_flash_ma           cuda_attn_L384_bfloat16     1.31  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.28  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.29  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.33  True
 torch_flash_ma           cuda_attn_L448_bfloat16     1.47  True
 torch_flash_ma           cuda_attn_L512_bfloat16     1.50  True
 </pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+   Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading triton (148.3MiB)
+Downloading pillow (6.7MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading torch (846.9MiB)
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+      Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 37 packages in 212ms
+</div>
+</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html
index 6414b268459e56cf2a96ef4b229b35fde2e104fa..7d03567858952d02de89e25ce04873ef34373a75 100644
--- a/flash_attn/impls/hf_kernels_flash_attn.html
+++ b/flash_attn/impls/hf_kernels_flash_attn.html
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 6.08s
+Cell: benchmark | 5.58s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         3.64%     160.058us        41.50%       1.823ms       1.823ms       0.000us         0.00%       3.744ms       3.744ms             1  
-                               _flash_attn_9e27194::fwd         1.78%      78.347us        37.86%       1.663ms     554.208us       2.792ms       100.00%       3.744ms       1.248ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.794ms       100.05%       2.794ms       2.794ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.792ms       100.00%       2.792ms     930.800us             3  
-                                Activity Buffer Request        33.00%       1.449ms        33.00%       1.449ms       1.449ms     951.685us        34.08%     951.685us     951.685us             1  
-                                 cudaDeviceGetAttribute         0.13%       5.638us         0.13%       5.638us       0.376us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.40%      17.551us         1.19%      52.122us      17.374us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.79%      34.571us         0.79%      34.571us      11.524us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.57%      24.890us         0.57%      24.890us       2.766us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.28%      12.210us         0.28%      12.210us       4.070us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.92%      40.292us         0.92%      40.292us      13.431us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.50%       2.569ms        58.50%       2.569ms       2.569ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         3.55%     156.153us        41.08%       1.807ms       1.807ms       0.000us         0.00%       3.775ms       3.775ms             1  
+                               _flash_attn_9e27194::fwd         1.65%      72.542us        37.53%       1.651ms     550.240us       2.812ms       100.00%       3.775ms       1.258ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.814ms       100.05%       2.814ms       2.814ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.812ms       100.00%       2.812ms     937.398us             3  
+                                Activity Buffer Request        32.22%       1.417ms        32.22%       1.417ms       1.417ms     962.880us        34.24%     962.880us     962.880us             1  
+                                 cudaDeviceGetAttribute         0.13%       5.500us         0.13%       5.500us       0.367us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.43%      19.110us         1.25%      54.882us      18.294us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.81%      35.772us         0.81%      35.772us      11.924us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.57%      25.101us         0.57%      25.101us       2.789us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.30%      13.270us         0.30%      13.270us       4.423us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.42%      62.402us         1.42%      62.402us      20.801us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.92%       2.591ms        58.92%       2.591ms       2.591ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.392ms
-Self CUDA time total: 2.792ms
+Self CPU time total: 4.398ms
+Self CUDA time total: 2.812ms
 
 
 
@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.22%      99.144us        37.48%       1.673ms       1.673ms       0.000us         0.00%       3.949ms       3.949ms             1  
-                               _flash_attn_9e27194::fwd         1.20%      53.462us        35.26%       1.574ms     524.654us       2.953ms       100.00%       3.949ms       1.316ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.955ms       100.05%       2.955ms       2.955ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.953ms       100.00%       2.953ms     984.436us             3  
-                                Activity Buffer Request        32.23%       1.439ms        32.23%       1.439ms       1.439ms     995.807us        33.72%     995.807us     995.807us             1  
-                                 cudaDeviceGetAttribute         0.10%       4.621us         0.10%       4.621us       0.308us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.17%       7.710us         0.56%      24.861us       8.287us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.38%      17.151us         0.38%      17.151us       5.717us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.47%      21.122us         0.47%      21.122us       2.347us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.08%       3.791us         0.08%       3.791us       1.264us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.61%      27.380us         0.61%      27.380us       9.127us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.52%       2.791ms        62.52%       2.791ms       2.791ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.04%      91.192us        36.62%       1.634ms       1.634ms       0.000us         0.00%       3.983ms       3.983ms             1  
+                               _flash_attn_9e27194::fwd         1.11%      49.718us        34.57%       1.543ms     514.203us       2.978ms       100.00%       3.983ms       1.328ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.980ms       100.05%       2.980ms       2.980ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.978ms       100.00%       2.978ms     992.707us             3  
+                                Activity Buffer Request        31.74%       1.416ms        31.74%       1.416ms       1.416ms       1.004ms        33.73%       1.004ms       1.004ms             1  
+                                 cudaDeviceGetAttribute         0.08%       3.711us         0.08%       3.711us       0.247us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.17%       7.481us         0.51%      22.841us       7.614us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.34%      15.360us         0.34%      15.360us       5.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.46%      20.620us         0.46%      20.620us       2.291us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.741us         0.08%       3.741us       1.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.58%      25.842us         0.58%      25.842us       8.614us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.38%       2.828ms        63.38%       2.828ms       2.828ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.464ms
-Self CUDA time total: 2.953ms
+Self CPU time total: 4.462ms
+Self CUDA time total: 2.978ms
 
 
 
@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.58%     116.955us        37.54%       1.702ms       1.702ms       0.000us         0.00%       4.041ms       4.041ms             1  
-                               _flash_attn_9e27194::fwd         1.53%      69.255us        34.96%       1.585ms     528.314us       3.010ms       100.00%       4.041ms       1.347ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.012ms       100.05%       3.012ms       3.012ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.010ms       100.00%       3.010ms       1.003ms             3  
-                                Activity Buffer Request        31.53%       1.430ms        31.53%       1.430ms       1.430ms       1.031ms        34.26%       1.031ms       1.031ms             1  
-                                 cudaDeviceGetAttribute         0.10%       4.450us         0.10%       4.450us       0.297us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.18%       8.151us         0.57%      25.801us       8.600us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.39%      17.650us         0.39%      17.650us       5.883us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.48%      21.771us         0.48%      21.771us       2.419us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.10%       4.360us         0.10%       4.360us       1.453us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.66%      29.790us         0.66%      29.790us       9.930us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.46%       2.832ms        62.46%       2.832ms       2.832ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.28%     105.284us        36.17%       1.673ms       1.673ms       0.000us         0.00%       4.145ms       4.145ms             1  
+                               _flash_attn_9e27194::fwd         1.09%      50.271us        33.89%       1.567ms     522.459us       3.096ms       100.00%       4.145ms       1.382ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.098ms       100.05%       3.098ms       3.098ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.096ms       100.00%       3.096ms       1.032ms             3  
+                                Activity Buffer Request        31.08%       1.437ms        31.08%       1.437ms       1.437ms       1.049ms        33.87%       1.049ms       1.049ms             1  
+                                 cudaDeviceGetAttribute         0.08%       3.850us         0.08%       3.850us       0.257us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.15%       7.061us         0.49%      22.631us       7.544us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.34%      15.570us         0.34%      15.570us       5.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.47%      21.760us         0.47%      21.760us       2.418us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.689us         0.08%       3.689us       1.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.61%      27.992us         0.61%      27.992us       9.331us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.83%       2.952ms        63.83%       2.952ms       2.952ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.534ms
-Self CUDA time total: 3.010ms
+Self CPU time total: 4.625ms
+Self CUDA time total: 3.096ms
 
 
 
@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.39%     114.805us        40.03%       1.925ms       1.925ms       0.000us         0.00%       4.094ms       4.094ms             1  
-                               _flash_attn_9e27194::fwd         1.09%      52.653us        37.65%       1.810ms     603.407us       3.063ms       100.00%       4.094ms       1.365ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.065ms       100.05%       3.065ms       3.065ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.063ms       100.00%       3.063ms       1.021ms             3  
-                                Activity Buffer Request        29.78%       1.432ms        29.78%       1.432ms       1.432ms       1.031ms        33.65%       1.031ms       1.031ms             1  
-                                 cudaDeviceGetAttribute         0.10%       4.861us         0.10%       4.861us       0.324us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.16%       7.720us         0.55%      26.331us       8.777us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.39%      18.611us         0.39%      18.611us       6.204us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.45%      21.731us         0.45%      21.731us       2.415us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.08%       3.728us         0.08%       3.728us       1.243us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         5.59%     268.862us         5.59%     268.862us      89.621us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        59.97%       2.884ms        59.97%       2.884ms       2.884ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.30%     110.882us        38.29%       1.842ms       1.842ms       0.000us         0.00%       4.161ms       4.161ms             1  
+                               _flash_attn_9e27194::fwd         1.05%      50.321us        35.98%       1.731ms     577.014us       3.117ms       100.00%       4.161ms       1.387ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.118ms       100.05%       3.118ms       3.118ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.117ms       100.00%       3.117ms       1.039ms             3  
+                                Activity Buffer Request        29.64%       1.426ms        29.64%       1.426ms       1.426ms       1.044ms        33.50%       1.044ms       1.044ms             1  
+                                 cudaDeviceGetAttribute         0.08%       3.780us         0.08%       3.780us       0.252us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.15%       7.259us         0.50%      24.240us       8.080us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.35%      16.981us         0.35%      16.981us       5.660us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.45%      21.602us         0.45%      21.602us       2.400us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.770us         0.08%       3.770us       1.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.18%     201.205us         4.18%     201.205us      67.068us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        61.71%       2.969ms        61.71%       2.969ms       2.969ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.809ms
-Self CUDA time total: 3.063ms
+Self CPU time total: 4.811ms
+Self CUDA time total: 3.117ms
 
 
 
@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.13%     113.755us        35.84%       1.918ms       1.918ms       0.000us         0.00%       4.786ms       4.786ms             1  
-                               _flash_attn_9e27194::fwd         1.02%      54.483us        33.71%       1.804ms     601.364us       3.588ms       100.00%       4.786ms       1.595ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.590ms       100.04%       3.590ms       3.590ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.588ms       100.00%       3.588ms       1.196ms             3  
-                                Activity Buffer Request        26.99%       1.445ms        26.99%       1.445ms       1.445ms       1.198ms        33.38%       1.198ms       1.198ms             1  
-                                 cudaDeviceGetAttribute         0.08%       4.270us         0.08%       4.270us       0.285us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.15%       8.039us         0.48%      25.640us       8.547us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.33%      17.601us         0.33%      17.601us       5.867us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.40%      21.582us         0.40%      21.582us       2.398us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.07%       3.700us         0.07%       3.700us       1.233us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.67%     249.891us         4.67%     249.891us      83.297us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        64.16%       3.434ms        64.16%       3.434ms       3.434ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.05%     108.443us        34.64%       1.832ms       1.832ms       0.000us         0.00%       4.810ms       4.810ms             1  
+                               _flash_attn_9e27194::fwd         0.96%      50.812us        32.59%       1.723ms     574.364us       3.602ms       100.00%       4.810ms       1.603ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.603ms       100.04%       3.603ms       3.603ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.602ms       100.00%       3.602ms       1.201ms             3  
+                                Activity Buffer Request        27.53%       1.455ms        27.53%       1.455ms       1.455ms       1.209ms        33.55%       1.209ms       1.209ms             1  
+                                 cudaDeviceGetAttribute         0.08%       4.070us         0.08%       4.070us       0.271us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.14%       7.390us         0.45%      23.900us       7.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.31%      16.510us         0.31%      16.510us       5.503us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.40%      21.151us         0.40%      21.151us       2.350us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.07%       3.770us         0.07%       3.770us       1.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.10%     164.023us         3.10%     164.023us      54.674us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        65.36%       3.455ms        65.36%       3.455ms       3.455ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.351ms
-Self CUDA time total: 3.588ms
+Self CPU time total: 5.287ms
+Self CUDA time total: 3.602ms
 
 
 
@@ -4046,41 +4046,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.08%     111.044us        35.25%       1.879ms       1.879ms       0.000us         0.00%       4.816ms       4.816ms             1  
-                               _flash_attn_9e27194::fwd         0.99%      52.834us        33.17%       1.768ms     589.427us       3.606ms       100.00%       4.816ms       1.605ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.607ms       100.05%       3.607ms       3.607ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.606ms       100.00%       3.606ms       1.202ms             3  
-                                Activity Buffer Request        26.56%       1.416ms        26.56%       1.416ms       1.416ms       1.210ms        33.55%       1.210ms       1.210ms             1  
-                                 cudaDeviceGetAttribute         0.08%       4.460us         0.08%       4.460us       0.297us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.14%       7.500us         0.49%      26.051us       8.684us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.35%      18.551us         0.35%      18.551us       6.184us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.41%      21.960us         0.41%      21.960us       2.440us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.08%       4.009us         0.08%       4.009us       1.336us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.55%     242.792us         4.55%     242.792us      80.931us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        64.75%       3.452ms        64.75%       3.452ms       3.452ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         1.95%     105.103us        34.11%       1.836ms       1.836ms       0.000us         0.00%       4.931ms       4.931ms             1  
+                               _flash_attn_9e27194::fwd         1.08%      58.141us        32.16%       1.731ms     577.087us       3.693ms       100.00%       4.931ms       1.644ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.695ms       100.04%       3.695ms       3.695ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.693ms       100.00%       3.693ms       1.231ms             3  
+                                Activity Buffer Request        26.71%       1.438ms        26.71%       1.438ms       1.438ms       1.238ms        33.53%       1.238ms       1.238ms             1  
+                                 cudaDeviceGetAttribute         0.08%       4.380us         0.08%       4.380us       0.292us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.15%       8.230us         0.50%      26.750us       8.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.34%      18.520us         0.34%      18.520us       6.173us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.48%      25.961us         0.48%      25.961us       2.885us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       4.220us         0.08%       4.220us       1.407us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.23%     173.714us         3.23%     173.714us      57.905us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        65.89%       3.548ms        65.89%       3.548ms       3.548ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.332ms
-Self CUDA time total: 3.606ms
+Self CPU time total: 5.384ms
+Self CUDA time total: 3.693ms
 
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.96  True
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.97  True
 hf_kernels_flash_attn    cuda_attn_L256_bfloat16     1.01  True
 hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.06  True
-hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.05  True
-hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.22  True
-hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.21  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.09  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.24  True
+hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.23  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-Installed 15 packages in 13ms
+<div class="cell-stderr">
+Fetching 20 files:   0%|          | 0/20 [00:00&lt;?, ?it/s]
+Fetching 20 files:  10%|█         | 2/20 [00:01&lt;00:13,  1.34it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 13.40it/s]
 </div>
-</div>
-<div class="cell-stderr">Fetching 20 files:   0%|          | 0/20 [00:00&lt;?, ?it/s]
-Fetching 20 files:   5%|▌         | 1/20 [00:00&lt;00:04,  4.26it/s]
-Fetching 20 files:  10%|█         | 2/20 [00:01&lt;00:17,  1.03it/s]
-Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 11.64it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html
index 3b19dae40ca718ad81f3050d7a0de99c655bf943..889bda3eb9ecfa28e1bd79f67d85d1acc88d58a0 100644
--- a/flash_attn/impls/hf_kernels_flash_attn3.html
+++ b/flash_attn/impls/hf_kernels_flash_attn3.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 5.68s
+Cell: benchmark | 5.52s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         3.89%     167.076us        44.49%       1.911ms       1.911ms       0.000us         0.00%       3.576ms       3.576ms             1  
-                                          FlashAttnFunc         3.00%     128.934us        40.60%       1.744ms     581.290us       0.000us         0.00%       3.576ms       1.192ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.82%      78.184us        37.60%       1.615ms     538.312us       2.688ms       100.00%       3.576ms       1.192ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.690ms       100.05%       2.690ms       2.690ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.688ms       100.00%       2.688ms     896.117us             3  
-                                Activity Buffer Request        33.29%       1.430ms        33.29%       1.430ms       1.430ms     887.327us        33.01%     887.327us     887.327us             1  
-                                            aten::empty         1.08%      46.281us         1.08%      46.281us       7.714us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.37%      15.900us         0.37%      15.900us       5.300us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.04%      44.671us         1.04%      44.671us      14.890us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        55.51%       2.384ms        55.51%       2.384ms       2.384ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         3.72%     161.222us        44.67%       1.935ms       1.935ms       0.000us         0.00%       3.599ms       3.599ms             1  
+                                          FlashAttnFunc         2.81%     121.834us        40.95%       1.774ms     591.218us       0.000us         0.00%       3.599ms       1.200ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.85%      79.992us        38.14%       1.652ms     550.607us       2.693ms       100.00%       3.599ms       1.200ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.695ms       100.05%       2.695ms       2.695ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.693ms       100.00%       2.693ms     897.759us             3  
+                                Activity Buffer Request        33.93%       1.470ms        33.93%       1.470ms       1.470ms     905.439us        33.62%     905.439us     905.439us             1  
+                                            aten::empty         1.00%      43.311us         1.00%      43.311us       7.219us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.32%      13.891us         0.32%      13.891us       4.630us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.04%      45.121us         1.04%      45.121us      15.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        55.33%       2.396ms        55.33%       2.396ms       2.396ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.295ms
-Self CUDA time total: 2.688ms
+Self CPU time total: 4.331ms
+Self CUDA time total: 2.693ms
 
 
 
@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         3.06%     130.754us        41.10%       1.758ms       1.758ms       0.000us         0.00%       3.668ms       3.668ms             1  
-                                          FlashAttnFunc         2.23%      95.572us        38.05%       1.627ms     542.455us       0.000us         0.00%       3.668ms       1.223ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.23%      52.754us        35.81%       1.532ms     510.598us       2.747ms       100.00%       3.668ms       1.223ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.748ms       100.05%       2.748ms       2.748ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.747ms       100.00%       2.747ms     915.501us             3  
-                                Activity Buffer Request        33.10%       1.416ms        33.10%       1.416ms       1.416ms     921.272us        33.54%     921.272us     921.272us             1  
-                                            aten::empty         0.63%      26.890us         0.63%      26.890us       4.482us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       4.970us         0.12%       4.970us       1.657us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.73%      31.351us         0.73%      31.351us      10.450us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.90%       2.519ms        58.90%       2.519ms       2.519ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.17%      96.772us        39.76%       1.770ms       1.770ms       0.000us         0.00%       3.876ms       3.876ms             1  
+                                          FlashAttnFunc         2.04%      90.694us        37.59%       1.674ms     557.834us       0.000us         0.00%       3.876ms       1.292ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.15%      51.142us        35.55%       1.583ms     527.603us       2.896ms       100.00%       3.876ms       1.292ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.898ms       100.05%       2.898ms       2.898ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.896ms       100.00%       2.896ms     965.387us             3  
+                                Activity Buffer Request        33.04%       1.471ms        33.04%       1.471ms       1.471ms     979.809us        33.83%     979.809us     979.809us             1  
+                                            aten::empty         0.58%      25.610us         0.58%      25.610us       4.268us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.240us         0.12%       5.240us       1.747us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.67%      29.750us         0.67%      29.750us       9.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        60.24%       2.682ms        60.24%       2.682ms       2.682ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.277ms
-Self CUDA time total: 2.747ms
+Self CPU time total: 4.452ms
+Self CUDA time total: 2.896ms
 
 
 
@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.33%     101.653us        39.53%       1.727ms       1.727ms       0.000us         0.00%       3.829ms       3.829ms             1  
-                                          FlashAttnFunc         2.05%      89.593us        37.20%       1.625ms     541.619us       0.000us         0.00%       3.829ms       1.276ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.17%      51.051us        35.15%       1.535ms     511.754us       2.856ms       100.00%       3.829ms       1.276ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.858ms       100.06%       2.858ms       2.858ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.856ms       100.00%       2.856ms     952.136us             3  
-                                Activity Buffer Request        32.54%       1.421ms        32.54%       1.421ms       1.421ms     972.574us        34.05%     972.574us     972.574us             1  
-                                            aten::empty         0.62%      27.231us         0.62%      27.231us       4.538us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       5.411us         0.12%       5.411us       1.804us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.69%      30.341us         0.69%      30.341us      10.114us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        60.47%       2.642ms        60.47%       2.642ms       2.642ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.19%      98.331us        39.82%       1.786ms       1.786ms       0.000us         0.00%       3.885ms       3.885ms             1  
+                                          FlashAttnFunc         1.99%      89.333us        37.63%       1.688ms     562.551us       0.000us         0.00%       3.885ms       1.295ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.08%      48.311us        35.64%       1.598ms     532.773us       2.912ms       100.00%       3.885ms       1.295ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.914ms       100.05%       2.914ms       2.914ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.912ms       100.00%       2.912ms     970.802us             3  
+                                Activity Buffer Request        33.18%       1.488ms        33.18%       1.488ms       1.488ms     972.637us        33.40%     972.637us     972.637us             1  
+                                            aten::empty         0.57%      25.370us         0.57%      25.370us       4.228us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.13%       5.730us         0.13%       5.730us       1.910us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.69%      30.861us         0.69%      30.861us      10.287us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        60.18%       2.699ms        60.18%       2.699ms       2.699ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.368ms
-Self CUDA time total: 2.856ms
+Self CPU time total: 4.485ms
+Self CUDA time total: 2.912ms
 
 
 
@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.61%     122.474us        42.62%       2.001ms       2.001ms       0.000us         0.00%       3.906ms       3.906ms             1  
-                                          FlashAttnFunc         1.99%      93.683us        40.01%       1.879ms     626.332us       0.000us         0.00%       3.906ms       1.302ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.17%      54.872us        38.02%       1.785ms     595.104us       2.915ms       100.00%       3.906ms       1.302ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.917ms       100.05%       2.917ms       2.917ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.915ms       100.00%       2.915ms     971.727us             3  
-                                Activity Buffer Request        31.11%       1.461ms        31.11%       1.461ms       1.461ms     991.129us        34.00%     991.129us     991.129us             1  
-                                            aten::empty         0.59%      27.622us         0.59%      27.622us       4.604us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       5.820us         0.12%       5.820us       1.940us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         5.03%     236.178us         5.03%     236.178us      78.726us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        57.38%       2.695ms        57.38%       2.695ms       2.695ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.51%     118.553us        41.81%       1.973ms       1.973ms       0.000us         0.00%       3.964ms       3.964ms             1  
+                                          FlashAttnFunc         1.94%      91.662us        39.30%       1.855ms     618.205us       0.000us         0.00%       3.964ms       1.321ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.07%      50.373us        37.36%       1.763ms     587.651us       2.962ms       100.00%       3.964ms       1.321ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.964ms       100.05%       2.964ms       2.964ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.962ms       100.00%       2.962ms     987.401us             3  
+                                Activity Buffer Request        30.92%       1.459ms        30.92%       1.459ms       1.459ms       1.002ms        33.82%       1.002ms       1.002ms             1  
+                                            aten::empty         0.56%      26.451us         0.56%      26.451us       4.408us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.270us         0.11%       5.270us       1.757us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.70%     221.845us         4.70%     221.845us      73.948us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.19%       2.746ms        58.19%       2.746ms       2.746ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.696ms
-Self CUDA time total: 2.915ms
+Self CPU time total: 4.719ms
+Self CUDA time total: 2.962ms
 
 
 
@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.45%     124.235us        37.18%       1.882ms       1.882ms       0.000us         0.00%       4.537ms       4.537ms             1  
-                                          FlashAttnFunc         1.83%      92.522us        34.73%       1.758ms     585.897us       0.000us         0.00%       4.537ms       1.512ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.03%      52.313us        32.90%       1.665ms     555.056us       3.398ms       100.00%       4.537ms       1.512ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.399ms       100.05%       3.399ms       3.399ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.398ms       100.00%       3.398ms       1.133ms             3  
-                                Activity Buffer Request        27.82%       1.408ms        27.82%       1.408ms       1.408ms       1.139ms        33.52%       1.139ms       1.139ms             1  
-                                            aten::empty         0.54%      27.441us         0.54%      27.441us       4.573us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       5.839us         0.12%       5.839us       1.946us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.39%     171.646us         3.39%     171.646us      57.215us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.82%       3.179ms        62.82%       3.179ms       3.179ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.19%     114.453us        37.34%       1.953ms       1.953ms       0.000us         0.00%       4.662ms       4.662ms             1  
+                                          FlashAttnFunc         1.73%      90.401us        35.15%       1.838ms     612.822us       0.000us         0.00%       4.662ms       1.554ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         0.97%      50.643us        33.42%       1.748ms     582.688us       3.490ms       100.00%       4.662ms       1.554ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.492ms       100.04%       3.492ms       3.492ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.490ms       100.00%       3.490ms       1.163ms             3  
+                                Activity Buffer Request        28.44%       1.487ms        28.44%       1.487ms       1.487ms       1.171ms        33.56%       1.171ms       1.171ms             1  
+                                            aten::empty         0.52%      27.271us         0.52%      27.271us       4.545us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.09%       4.950us         0.09%       4.950us       1.650us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.40%     178.024us         3.40%     178.024us      59.341us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        62.66%       3.277ms        62.66%       3.277ms       3.277ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.061ms
-Self CUDA time total: 3.398ms
+Self CPU time total: 5.230ms
+Self CUDA time total: 3.490ms
 
 
 
@@ -4035,33 +4035,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.74%     138.223us        36.95%       1.864ms       1.864ms       0.000us         0.00%       4.557ms       4.557ms             1  
-                                          FlashAttnFunc         1.84%      92.725us        34.21%       1.726ms     575.197us       0.000us         0.00%       4.557ms       1.519ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.03%      52.171us        32.37%       1.633ms     544.289us       3.424ms       100.00%       4.557ms       1.519ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.425ms       100.04%       3.425ms       3.425ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.424ms       100.00%       3.424ms       1.141ms             3  
-                                Activity Buffer Request        27.34%       1.379ms        27.34%       1.379ms       1.379ms       1.133ms        33.10%       1.133ms       1.133ms             1  
-                                            aten::empty         0.57%      28.661us         0.57%      28.661us       4.777us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.10%       5.240us         0.10%       5.240us       1.747us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.33%     167.776us         3.33%     167.776us      55.925us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        63.05%       3.181ms        63.05%       3.181ms       3.181ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.26%     115.663us        36.27%       1.854ms       1.854ms       0.000us         0.00%       4.679ms       4.679ms             1  
+                                          FlashAttnFunc         2.25%     114.773us        34.01%       1.738ms     579.364us       0.000us         0.00%       4.679ms       1.560ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.02%      51.933us        31.76%       1.623ms     541.107us       3.499ms       100.00%       4.679ms       1.560ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.500ms       100.04%       3.500ms       3.500ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.499ms       100.00%       3.499ms       1.166ms             3  
+                                Activity Buffer Request        26.80%       1.370ms        26.80%       1.370ms       1.370ms       1.181ms        33.75%       1.181ms       1.181ms             1  
+                                            aten::empty         0.54%      27.681us         0.54%      27.681us       4.613us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.10%       5.079us         0.10%       5.079us       1.693us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.30%     168.813us         3.30%     168.813us      56.271us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.73%       3.257ms        63.73%       3.257ms       3.257ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.045ms
-Self CUDA time total: 3.424ms
+Self CPU time total: 5.111ms
+Self CUDA time total: 3.499ms
 
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.92  True
-hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.96  True
-hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.02  True
-hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.02  True
-hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.18  True
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.94  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.97  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.04  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.05  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.20  True
 hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.18  True
 </pre></div>
 <div class="cell-stderr">
 Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
-Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.27it/s]
-Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.55it/s]
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.38it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.75it/s]
 </div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html
index 9e30082f387ab5511025b216cc2dd03e743dccac..f6ab4e24cf377304db7fbbedb7a4571918177b17 100644
--- a/flash_attn/impls/mem_efficient_attention.html
+++ b/flash_attn/impls/mem_efficient_attention.html
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 32.68s
+Cell: benchmark | 3.92s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         4.77%     340.490us        32.91%       2.350ms       2.350ms       0.000us         0.00%       5.530ms       5.530ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.523ms       100.81%       5.523ms       5.523ms             1  
-                     aten::scaled_dot_product_attention         0.44%      31.421us         2.67%     190.938us      63.646us       0.000us         0.00%       4.861ms       1.620ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.35%      24.771us         2.23%     159.517us      53.172us       0.000us         0.00%       4.861ms       1.620ms             3  
-                     aten::_efficient_attention_forward         0.51%      36.163us         1.50%     107.413us      35.804us       4.861ms        88.73%       4.861ms       1.620ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       4.861ms        88.73%       4.861ms       1.620ms             3  
-                                       aten::contiguous         0.17%      12.232us        24.52%       1.751ms     194.525us       0.000us         0.00%     668.128us      74.236us             9  
-                                            aten::clone         0.48%      34.579us        24.35%       1.738ms     193.165us       0.000us         0.00%     668.128us      74.236us             9  
-                                            aten::copy_         1.16%      82.494us        22.79%       1.628ms     180.845us     617.312us        11.27%     668.128us      74.236us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     617.312us        11.27%     617.312us      68.590us             9  
-                                Activity Buffer Request        20.35%       1.453ms        20.35%       1.453ms       1.453ms      50.816us         0.93%      50.816us      50.816us             1  
-                                        aten::transpose         1.00%      71.754us         1.33%      95.065us       3.961us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.33%      23.311us         0.33%      23.311us       0.971us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.27%      19.481us         1.07%      76.301us       8.478us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         1.26%      89.759us         1.26%      89.759us       4.274us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.62%     115.656us         1.62%     115.656us       9.638us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.04%       2.980us         0.04%       2.980us       0.993us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.16%      11.490us         0.16%      11.490us       3.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        67.09%       4.790ms        67.09%       4.790ms       4.790ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         4.77%     333.269us        32.71%       2.284ms       2.284ms       0.000us         0.00%       5.420ms       5.420ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.402ms       100.61%       5.402ms       5.402ms             1  
+                     aten::scaled_dot_product_attention         0.44%      30.450us         2.54%     177.435us      59.145us       0.000us         0.00%       4.753ms       1.584ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.33%      22.722us         2.10%     146.985us      48.995us       0.000us         0.00%       4.753ms       1.584ms             3  
+                     aten::_efficient_attention_forward         0.51%      35.382us         1.42%      99.273us      33.091us       4.753ms        88.51%       4.753ms       1.584ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       4.753ms        88.51%       4.753ms       1.584ms             3  
+                                       aten::contiguous         0.17%      11.660us        24.51%       1.712ms     190.185us       0.000us         0.00%     667.266us      74.141us             9  
+                                            aten::clone         0.46%      31.810us        24.34%       1.700ms     188.889us       0.000us         0.00%     667.266us      74.141us             9  
+                                            aten::copy_         1.01%      70.871us        22.86%       1.597ms     177.404us     616.738us        11.49%     667.266us      74.141us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     616.738us        11.49%     616.738us      68.526us             9  
+                                Activity Buffer Request        20.64%       1.441ms        20.64%       1.441ms       1.441ms      50.528us         0.94%      50.528us      50.528us             1  
+                                        aten::transpose         0.91%      63.619us         1.25%      87.011us       3.625us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      23.392us         0.33%      23.392us       0.975us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.24%      16.972us         1.02%      71.553us       7.950us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         1.18%      82.691us         1.18%      82.691us       3.938us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.55%     108.383us         1.55%     108.383us       9.032us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.05%       3.260us         0.05%       3.260us       1.087us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.12%       8.450us         0.12%       8.450us       2.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        67.29%       4.700ms        67.29%       4.700ms       4.700ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.140ms
-Self CUDA time total: 5.479ms
+Self CPU time total: 6.984ms
+Self CUDA time total: 5.369ms
 
 
 
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         3.38%     251.986us        27.98%       2.086ms       2.086ms       0.000us         0.00%       6.014ms       6.014ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.969ms       100.15%       5.969ms       5.969ms             1  
-                     aten::scaled_dot_product_attention         0.27%      19.962us         1.97%     146.646us      48.882us       0.000us         0.00%       5.323ms       1.774ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.26%      19.141us         1.70%     126.684us      42.228us       0.000us         0.00%       5.323ms       1.774ms             3  
-                     aten::_efficient_attention_forward         0.39%      29.281us         1.12%      83.514us      27.838us       5.323ms        89.32%       5.323ms       1.774ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.323ms        89.32%       5.323ms       1.774ms             3  
-                                       aten::contiguous         0.10%       7.510us        22.05%       1.644ms     182.655us       0.000us         0.00%     690.909us      76.768us             9  
-                                            aten::clone         0.31%      23.251us        21.95%       1.636ms     181.821us       0.000us         0.00%     690.909us      76.768us             9  
-                                            aten::copy_         0.91%      68.131us        20.95%       1.562ms     173.540us     636.478us        10.68%     690.909us      76.768us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     636.478us        10.68%     636.478us      70.720us             9  
-                                Activity Buffer Request        19.09%       1.423ms        19.09%       1.423ms       1.423ms      54.431us         0.91%      54.431us      54.431us             1  
-                                        aten::transpose         0.68%      50.542us         0.90%      67.292us       2.804us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.22%      16.750us         0.22%      16.750us       0.698us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.17%      12.371us         0.69%      51.272us       5.697us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.87%      64.771us         0.87%      64.771us       3.084us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.25%      93.466us         1.25%      93.466us       7.789us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.400us         0.03%       2.400us       0.800us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.05%       3.371us         0.05%       3.371us       1.124us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        72.02%       5.368ms        72.02%       5.368ms       5.368ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.53%     251.015us        29.52%       2.098ms       2.098ms       0.000us         0.00%       5.633ms       5.633ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.587ms       100.15%       5.587ms       5.587ms             1  
+                     aten::scaled_dot_product_attention         0.25%      17.630us         2.05%     145.594us      48.531us       0.000us         0.00%       4.943ms       1.648ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.28%      19.810us         1.80%     127.964us      42.655us       0.000us         0.00%       4.943ms       1.648ms             3  
+                     aten::_efficient_attention_forward         0.42%      29.862us         1.18%      83.512us      27.837us       4.943ms        88.61%       4.943ms       1.648ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       4.943ms        88.61%       4.943ms       1.648ms             3  
+                                       aten::contiguous         0.10%       7.191us        23.30%       1.656ms     184.002us       0.000us         0.00%     689.540us      76.616us             9  
+                                            aten::clone         0.33%      23.318us        23.20%       1.649ms     183.203us       0.000us         0.00%     689.540us      76.616us             9  
+                                            aten::copy_         0.92%      65.725us        22.12%       1.572ms     174.717us     635.140us        11.39%     689.540us      76.616us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     635.140us        11.39%     635.140us      70.571us             9  
+                                Activity Buffer Request        20.24%       1.439ms        20.24%       1.439ms       1.439ms      54.400us         0.98%      54.400us      54.400us             1  
+                                        aten::transpose         0.71%      50.494us         0.99%      70.123us       2.922us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.28%      19.629us         0.28%      19.629us       0.818us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.18%      12.608us         0.75%      53.061us       5.896us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.94%      66.903us         0.94%      66.903us       3.186us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.25%      89.012us         1.25%      89.012us       7.418us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.220us         0.03%       2.220us       0.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.05%       3.880us         0.05%       3.880us       1.293us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        70.48%       5.009ms        70.48%       5.009ms       5.009ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.454ms
-Self CUDA time total: 5.959ms
+Self CPU time total: 7.107ms
+Self CUDA time total: 5.578ms
 
 
 
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         3.08%     235.490us        27.25%       2.083ms       2.083ms       0.000us         0.00%       6.182ms       6.182ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.132ms       100.15%       6.132ms       6.132ms             1  
-                     aten::scaled_dot_product_attention         0.24%      18.220us         1.86%     142.046us      47.349us       0.000us         0.00%       5.466ms       1.822ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.24%      18.131us         1.62%     123.826us      41.275us       0.000us         0.00%       5.466ms       1.822ms             3  
-                     aten::_efficient_attention_forward         0.37%      27.940us         1.08%      82.291us      27.430us       5.466ms        89.28%       5.466ms       1.822ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.466ms        89.28%       5.466ms       1.822ms             3  
-                                       aten::contiguous         0.10%       7.272us        21.47%       1.642ms     182.409us       0.000us         0.00%     715.197us      79.466us             9  
-                                            aten::clone         0.29%      22.290us        21.38%       1.634ms     181.601us       0.000us         0.00%     715.197us      79.466us             9  
-                                            aten::copy_         0.83%      63.251us        20.39%       1.559ms     173.182us     656.318us        10.72%     715.197us      79.466us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     656.318us        10.72%     656.318us      72.924us             9  
-                                Activity Buffer Request        18.70%       1.430ms        18.70%       1.430ms       1.430ms      58.879us         0.96%      58.879us      58.879us             1  
-                                        aten::transpose         0.93%      71.209us         1.15%      87.625us       3.651us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.21%      16.416us         0.21%      16.416us       0.684us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.15%      11.741us         0.70%      53.481us       5.942us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.89%      67.840us         0.89%      67.840us       3.230us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.15%      88.022us         1.15%      88.022us       7.335us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.651us         0.03%       2.651us       0.884us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.04%       3.370us         0.04%       3.370us       1.123us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        72.75%       5.562ms        72.75%       5.562ms       5.562ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.28%     246.598us        28.54%       2.146ms       2.146ms       0.000us         0.00%       6.014ms       6.014ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.967ms       100.18%       5.967ms       5.967ms             1  
+                     aten::scaled_dot_product_attention         0.24%      18.181us         1.92%     144.583us      48.194us       0.000us         0.00%       5.302ms       1.767ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.27%      19.980us         1.68%     126.402us      42.134us       0.000us         0.00%       5.302ms       1.767ms             3  
+                     aten::_efficient_attention_forward         0.38%      28.571us         1.10%      82.521us      27.507us       5.302ms        89.01%       5.302ms       1.767ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.302ms        89.01%       5.302ms       1.767ms             3  
+                                       aten::contiguous         0.09%       6.930us        22.70%       1.707ms     189.666us       0.000us         0.00%     712.547us      79.172us             9  
+                                            aten::clone         0.30%      22.691us        22.61%       1.700ms     188.896us       0.000us         0.00%     712.547us      79.172us             9  
+                                            aten::copy_         1.08%      81.024us        21.57%       1.622ms     180.228us     654.403us        10.99%     712.547us      79.172us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     654.403us        10.99%     654.403us      72.711us             9  
+                                Activity Buffer Request        19.57%       1.471ms        19.57%       1.471ms       1.471ms      58.144us         0.98%      58.144us      58.144us             1  
+                                        aten::transpose         0.68%      51.431us         0.95%      71.351us       2.973us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.26%      19.920us         0.26%      19.920us       0.830us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.16%      11.979us         0.74%      55.320us       6.147us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.93%      69.561us         0.93%      69.561us       3.312us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.22%      91.652us         1.22%      91.652us       7.638us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.359us         0.03%       2.359us       0.786us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.05%       3.430us         0.05%       3.430us       1.143us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.46%       5.373ms        71.46%       5.373ms       5.373ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.646ms
-Self CUDA time total: 6.123ms
+Self CPU time total: 7.519ms
+Self CUDA time total: 5.956ms
 
 
 
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         2.84%     224.838us        29.78%       2.354ms       2.354ms       0.000us         0.00%       6.170ms       6.170ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.121ms       100.15%       6.121ms       6.121ms             1  
-                     aten::scaled_dot_product_attention         0.24%      18.891us         1.82%     143.646us      47.882us       0.000us         0.00%       5.458ms       1.819ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.24%      19.093us         1.58%     124.755us      41.585us       0.000us         0.00%       5.458ms       1.819ms             3  
-                     aten::_efficient_attention_forward         0.36%      28.140us         1.04%      82.213us      27.404us       5.458ms        89.30%       5.458ms       1.819ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.458ms        89.30%       5.458ms       1.819ms             3  
-                                       aten::contiguous         0.10%       7.739us        24.57%       1.942ms     215.806us       0.000us         0.00%     711.998us      79.111us             9  
-                                            aten::clone         0.31%      24.450us        24.47%       1.935ms     214.946us       0.000us         0.00%     711.998us      79.111us             9  
-                                            aten::copy_         0.86%      68.064us        23.51%       1.859ms     206.523us     653.982us        10.70%     711.998us      79.111us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     653.982us        10.70%     653.982us      72.665us             9  
-                                Activity Buffer Request        18.84%       1.489ms        18.84%       1.489ms       1.489ms      58.016us         0.95%      58.016us      58.016us             1  
-                                        aten::transpose         0.62%      49.288us         0.84%      66.489us       2.770us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.22%      17.201us         0.22%      17.201us       0.717us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.15%      12.041us         0.65%      51.362us       5.707us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.83%      65.351us         0.83%      65.351us       3.112us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         4.09%     323.234us         4.09%     323.234us      26.936us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.670us         0.03%       2.670us       0.890us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.04%       3.430us         0.04%       3.430us       1.143us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        70.22%       5.551ms        70.22%       5.551ms       5.551ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.21%     251.576us        29.97%       2.347ms       2.347ms       0.000us         0.00%       6.116ms       6.116ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.068ms       100.14%       6.068ms       6.068ms             1  
+                     aten::scaled_dot_product_attention         0.24%      18.800us         1.87%     146.693us      48.898us       0.000us         0.00%       5.408ms       1.803ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.25%      19.900us         1.63%     127.893us      42.631us       0.000us         0.00%       5.408ms       1.803ms             3  
+                     aten::_efficient_attention_forward         0.38%      29.372us         1.07%      83.903us      27.968us       5.408ms        89.25%       5.408ms       1.803ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.408ms        89.25%       5.408ms       1.803ms             3  
+                                       aten::contiguous         0.10%       7.511us        24.29%       1.902ms     211.340us       0.000us         0.00%     708.735us      78.748us             9  
+                                            aten::clone         0.28%      21.872us        24.19%       1.895ms     210.505us       0.000us         0.00%     708.735us      78.748us             9  
+                                            aten::copy_         0.85%      66.540us        23.20%       1.817ms     201.834us     651.551us        10.75%     708.735us      78.748us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     651.551us        10.75%     651.551us      72.395us             9  
+                                Activity Buffer Request        18.68%       1.462ms        18.68%       1.462ms       1.462ms      57.184us         0.94%      57.184us      57.184us             1  
+                                        aten::transpose         0.65%      50.781us         0.90%      70.402us       2.933us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.25%      19.621us         0.25%      19.621us       0.818us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.15%      11.809us         0.72%      56.170us       6.241us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.90%      70.242us         0.90%      70.242us       3.345us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         3.97%     310.797us         3.97%     310.797us      25.900us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.250us         0.03%       2.250us       0.750us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.220us         0.04%       3.220us       1.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        70.03%       5.484ms        70.03%       5.484ms       5.484ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.905ms
-Self CUDA time total: 6.112ms
+Self CPU time total: 7.830ms
+Self CUDA time total: 6.059ms
 
 
 
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         2.78%     220.799us        28.42%       2.258ms       2.258ms       0.000us         0.00%       6.296ms       6.296ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.245ms       100.15%       6.245ms       6.245ms             1  
-                     aten::scaled_dot_product_attention         0.24%      19.311us         1.79%     142.116us      47.372us       0.000us         0.00%       5.574ms       1.858ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.23%      17.909us         1.55%     122.805us      40.935us       0.000us         0.00%       5.574ms       1.858ms             3  
-                     aten::_efficient_attention_forward         0.36%      28.682us         1.03%      82.073us      27.358us       5.574ms        89.39%       5.574ms       1.858ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.574ms        89.39%       5.574ms       1.858ms             3  
-                                       aten::contiguous         0.09%       7.009us        23.32%       1.852ms     205.811us       0.000us         0.00%     721.599us      80.178us             9  
-                                            aten::clone         0.28%      22.450us        23.23%       1.845ms     205.033us       0.000us         0.00%     721.599us      80.178us             9  
-                                            aten::copy_         0.87%      68.713us        22.33%       1.774ms     197.096us     661.695us        10.61%     721.599us      80.178us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     661.695us        10.61%     661.695us      73.522us             9  
-                                Activity Buffer Request        17.91%       1.422ms        17.91%       1.422ms       1.422ms      59.904us         0.96%      59.904us      59.904us             1  
-                                        aten::transpose         0.61%      48.435us         0.82%      65.304us       2.721us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.21%      16.869us         0.21%      16.869us       0.703us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.14%      11.511us         0.62%      48.982us       5.442us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.78%      61.691us         0.78%      61.691us       2.938us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         3.85%     305.580us         3.85%     305.580us      25.465us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.440us         0.03%       2.440us       0.813us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.05%       3.920us         0.05%       3.920us       1.307us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        71.58%       5.685ms        71.58%       5.685ms       5.685ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.15%     250.575us        28.50%       2.270ms       2.270ms       0.000us         0.00%       6.322ms       6.322ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.270ms       100.14%       6.270ms       6.270ms             1  
+                     aten::scaled_dot_product_attention         0.22%      17.572us         1.82%     145.084us      48.361us       0.000us         0.00%       5.598ms       1.866ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.24%      19.250us         1.60%     127.512us      42.504us       0.000us         0.00%       5.598ms       1.866ms             3  
+                     aten::_efficient_attention_forward         0.36%      28.812us         1.05%      83.962us      27.987us       5.598ms        89.40%       5.598ms       1.866ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.598ms        89.40%       5.598ms       1.866ms             3  
+                                       aten::contiguous         0.09%       6.912us        22.94%       1.827ms     203.045us       0.000us         0.00%     724.000us      80.444us             9  
+                                            aten::clone         0.28%      21.949us        22.86%       1.820ms     202.277us       0.000us         0.00%     724.000us      80.444us             9  
+                                            aten::copy_         0.82%      65.091us        21.89%       1.744ms     193.745us     664.032us        10.60%     724.000us      80.444us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     664.032us        10.60%     664.032us      73.781us             9  
+                                Activity Buffer Request        18.02%       1.435ms        18.02%       1.435ms       1.435ms      59.968us         0.96%      59.968us      59.968us             1  
+                                        aten::transpose         0.64%      50.930us         0.89%      70.859us       2.952us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.25%      19.929us         0.25%      19.929us       0.830us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.15%      12.022us         0.69%      54.843us       6.094us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.87%      69.430us         0.87%      69.430us       3.306us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         3.34%     266.388us         3.34%     266.388us      22.199us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.320us         0.03%       2.320us       0.773us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.120us         0.04%       3.120us       1.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.50%       5.695ms        71.50%       5.695ms       5.695ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.943ms
-Self CUDA time total: 6.236ms
+Self CPU time total: 7.965ms
+Self CUDA time total: 6.262ms
 
 
 
@@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         3.27%     267.711us        29.30%       2.401ms       2.401ms       0.000us         0.00%       6.459ms       6.459ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.406ms       100.13%       6.406ms       6.406ms             1  
-                     aten::scaled_dot_product_attention         0.24%      19.643us         1.85%     151.176us      50.392us       0.000us         0.00%       5.726ms       1.909ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.26%      20.920us         1.61%     131.533us      43.844us       0.000us         0.00%       5.726ms       1.909ms             3  
-                     aten::_efficient_attention_forward         0.37%      30.563us         1.03%      84.603us      28.201us       5.726ms        89.50%       5.726ms       1.909ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.726ms        89.50%       5.726ms       1.909ms             3  
-                                       aten::contiguous         0.09%       7.670us        23.58%       1.932ms     214.647us       0.000us         0.00%     733.247us      81.472us             9  
-                                            aten::clone         0.31%      25.042us        23.48%       1.924ms     213.795us       0.000us         0.00%     733.247us      81.472us             9  
-                                            aten::copy_         0.88%      72.162us        22.52%       1.845ms     205.052us     671.711us        10.50%     733.247us      81.472us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     671.711us        10.50%     671.711us      74.635us             9  
-                                Activity Buffer Request        17.78%       1.456ms        17.78%       1.456ms       1.456ms      61.536us         0.96%      61.536us      61.536us             1  
-                                        aten::transpose         0.71%      58.110us         0.93%      75.842us       3.160us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.22%      17.732us         0.22%      17.732us       0.739us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.15%      12.319us         0.65%      53.641us       5.960us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.81%      66.513us         0.81%      66.513us       3.167us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         4.14%     339.159us         4.14%     339.159us      28.263us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.379us         0.03%       2.379us       0.793us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.05%       4.230us         0.05%       4.230us       1.410us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        70.70%       5.793ms        70.70%       5.793ms       5.793ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.00%     248.403us        26.98%       2.232ms       2.232ms       0.000us         0.00%       6.668ms       6.668ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.616ms       100.13%       6.616ms       6.616ms             1  
+                     aten::scaled_dot_product_attention         0.21%      17.221us         1.72%     142.654us      47.551us       0.000us         0.00%       5.939ms       1.980ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.23%      18.779us         1.52%     125.433us      41.811us       0.000us         0.00%       5.939ms       1.980ms             3  
+                     aten::_efficient_attention_forward         0.34%      28.440us         0.99%      81.712us      27.237us       5.939ms        89.88%       5.939ms       1.980ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.939ms        89.88%       5.939ms       1.980ms             3  
+                                       aten::contiguous         0.08%       6.861us        21.66%       1.792ms     199.142us       0.000us         0.00%     729.440us      81.049us             9  
+                                            aten::clone         0.26%      21.352us        21.58%       1.785ms     198.379us       0.000us         0.00%     729.440us      81.049us             9  
+                                            aten::copy_         0.83%      69.012us        20.65%       1.709ms     189.858us     668.928us        10.12%     729.440us      81.049us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     668.928us        10.12%     668.928us      74.325us             9  
+                                Activity Buffer Request        17.29%       1.430ms        17.29%       1.430ms       1.430ms      60.512us         0.92%      60.512us      60.512us             1  
+                                        aten::transpose         0.63%      51.780us         0.89%      73.784us       3.074us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.27%      22.004us         0.27%      22.004us       0.917us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.14%      11.870us         0.67%      55.340us       6.149us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.84%      69.312us         0.84%      69.312us       3.301us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         2.79%     231.145us         2.79%     231.145us      19.262us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.280us         0.03%       2.280us       0.760us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.570us         0.04%       3.570us       1.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        73.02%       6.041ms        73.02%       6.041ms       6.041ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 8.193ms
-Self CUDA time total: 6.398ms
+Self CPU time total: 8.273ms
+Self CUDA time total: 6.608ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_mem_eff            cuda_attn_L128_bfloat16     1.86  True
-torch_mem_eff            cuda_attn_L256_bfloat16     1.97  True
-torch_mem_eff            cuda_attn_L320_bfloat16     2.04  True
-torch_mem_eff            cuda_attn_L384_bfloat16     2.06  True
-torch_mem_eff            cuda_attn_L448_bfloat16     2.03  True
+torch_mem_eff            cuda_attn_L128_bfloat16     1.83  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.89  True
+torch_mem_eff            cuda_attn_L320_bfloat16     2.00  True
+torch_mem_eff            cuda_attn_L384_bfloat16     1.97  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.06  True
 torch_mem_eff            cuda_attn_L512_bfloat16     2.19  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
-Downloading networkx (1.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading numpy (16.2MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading triton (148.3MiB)
-Downloading torch (846.9MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 216ms
-</div>
-</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html
index 573b17feaed54b9320d6ff8e360dfa03da8f3be9..9d07a2ce157ec6414ddbe4c27bea52ef7ed253b0 100644
--- a/flash_attn/impls/sage_attention.html
+++ b/flash_attn/impls/sage_attention.html
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 4.22s
+Cell: benchmark | 4.53s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3920,23 +3920,28 @@ Cell: benchmark | 4.22s
 <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
 impl                     wl                  p50(ms)  ok
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 </pre></div>
-<div class="cell-stderr">
-Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
-Fetching 11 files:  73%|███████▎  | 8/11 [00:00&lt;00:00, 13.92it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 19.13it/s]
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 15 packages in 14ms
 </div>
+</div>
+<div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
+Fetching 11 files:  18%|█▊        | 2/11 [00:00&lt;00:00, 15.79it/s]
+Fetching 11 files:  73%|███████▎  | 8/11 [00:00&lt;00:00, 13.55it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 18.83it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html
index 2dadff0b53907b1426c870df5e01dac812507a43..6363e024de1afb10cb31713f99cf844d998ebe90 100644
--- a/flash_attn/impls/xformers.html
+++ b/flash_attn/impls/xformers.html
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         9.93%     451.937us        49.71%       2.262ms       2.262ms       0.000us         0.00%       3.695ms       3.695ms             1  
-                             xformers_flash3::flash_fwd         4.26%     193.656us        38.96%       1.773ms     590.904us       0.000us         0.00%       3.695ms       1.232ms             3  
-                                      flash_attn_3::fwd         1.62%      73.841us        34.71%       1.579ms     526.352us       2.795ms       100.00%       3.695ms       1.232ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.797ms       100.05%       2.797ms       2.797ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.795ms       100.00%       2.795ms     931.773us             3  
-                                Activity Buffer Request        31.17%       1.418ms        31.17%       1.418ms       1.418ms     899.421us        32.18%     899.421us     899.421us             1  
-                                            aten::empty         0.76%      34.741us         0.76%      34.741us       5.790us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.30%      13.732us         0.30%      13.732us       4.577us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.85%      38.662us         0.85%      38.662us      12.887us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.35%      15.860us         0.82%      37.181us       6.197us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.47%      21.321us         0.47%      21.321us       3.553us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        50.29%       2.288ms        50.29%       2.288ms       2.288ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff        10.85%     481.112us        51.55%       2.285ms       2.285ms       0.000us         0.00%       3.582ms       3.582ms             1  
+                             xformers_flash3::flash_fwd         4.56%     202.185us        39.85%       1.766ms     588.715us       0.000us         0.00%       3.582ms       1.194ms             3  
+                                      flash_attn_3::fwd         1.68%      74.662us        35.29%       1.564ms     521.320us       2.681ms       100.00%       3.582ms       1.194ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.682ms       100.06%       2.682ms       2.682ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.681ms       100.00%       2.681ms     893.515us             3  
+                                Activity Buffer Request        31.74%       1.407ms        31.74%       1.407ms       1.407ms     901.761us        33.64%     901.761us     901.761us             1  
+                                            aten::empty         0.77%      33.920us         0.77%      33.920us       5.653us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.23%      10.152us         0.23%      10.152us       3.384us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.87%      38.521us         0.87%      38.521us      12.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.29%      13.028us         0.85%      37.710us       6.285us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.56%      24.682us         0.56%      24.682us       4.114us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        48.45%       2.147ms        48.45%       2.147ms       2.147ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.550ms
-Self CUDA time total: 2.795ms
+Self CPU time total: 4.432ms
+Self CUDA time total: 2.681ms
 
 
 
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         6.95%     312.321us        44.96%       2.021ms       2.021ms       0.000us         0.00%       3.832ms       3.832ms             1  
-                             xformers_flash3::flash_fwd         3.14%     141.315us        37.51%       1.686ms     561.970us       0.000us         0.00%       3.832ms       1.277ms             3  
-                                      flash_attn_3::fwd         1.18%      53.030us        34.37%       1.545ms     514.865us       2.890ms       100.00%       3.832ms       1.277ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.892ms       100.05%       2.892ms       2.892ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.890ms       100.00%       2.890ms     963.329us             3  
-                                Activity Buffer Request        31.64%       1.422ms        31.64%       1.422ms       1.422ms     942.465us        32.61%     942.465us     942.465us             1  
-                                            aten::empty         0.68%      30.660us         0.68%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       5.592us         0.12%       5.592us       1.864us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.74%      33.432us         0.74%      33.432us      11.144us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.20%       8.951us         0.50%      22.691us       3.782us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.31%      13.740us         0.31%      13.740us       2.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        55.04%       2.474ms        55.04%       2.474ms       2.474ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         7.16%     317.438us        45.96%       2.036ms       2.036ms       0.000us         0.00%       3.779ms       3.779ms             1  
+                             xformers_flash3::flash_fwd         3.35%     148.243us        38.25%       1.695ms     564.991us       0.000us         0.00%       3.779ms       1.260ms             3  
+                                      flash_attn_3::fwd         1.25%      55.403us        34.91%       1.547ms     515.576us       2.825ms       100.00%       3.779ms       1.260ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.827ms       100.05%       2.827ms       2.827ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.825ms       100.00%       2.825ms     941.739us             3  
+                                Activity Buffer Request        32.14%       1.424ms        32.14%       1.424ms       1.424ms     954.080us        33.77%     954.080us     954.080us             1  
+                                            aten::empty         0.63%      27.720us         0.63%      27.720us       4.620us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.400us         0.12%       5.400us       1.800us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.77%      34.161us         0.77%      34.161us      11.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.21%       9.370us         0.54%      23.750us       3.958us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.32%      14.380us         0.32%      14.380us       2.397us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        54.04%       2.395ms        54.04%       2.395ms       2.395ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.495ms
-Self CUDA time total: 2.890ms
+Self CPU time total: 4.431ms
+Self CUDA time total: 2.825ms
 
 
 
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         6.65%     298.008us        44.73%       2.006ms       2.006ms       0.000us         0.00%       3.867ms       3.867ms             1  
-                             xformers_flash3::flash_fwd         3.15%     141.235us        37.58%       1.685ms     561.690us       0.000us         0.00%       3.867ms       1.289ms             3  
-                                      flash_attn_3::fwd         1.18%      53.120us        34.43%       1.544ms     514.611us       2.888ms       100.00%       3.867ms       1.289ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.890ms       100.06%       2.890ms       2.890ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.888ms       100.00%       2.888ms     962.683us             3  
-                                Activity Buffer Request        31.72%       1.422ms        31.72%       1.422ms       1.422ms     978.939us        33.90%     978.939us     978.939us             1  
-                                            aten::empty         0.67%      30.192us         0.67%      30.192us       5.032us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       5.491us         0.12%       5.491us       1.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.73%      32.901us         0.73%      32.901us      10.967us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.20%       8.773us         0.50%      22.603us       3.767us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.31%      13.830us         0.31%      13.830us       2.305us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        55.27%       2.478ms        55.27%       2.478ms       2.478ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         6.87%     310.027us        44.72%       2.018ms       2.018ms       0.000us         0.00%       3.923ms       3.923ms             1  
+                             xformers_flash3::flash_fwd         3.22%     145.444us        37.33%       1.684ms     561.324us       0.000us         0.00%       3.923ms       1.308ms             3  
+                                      flash_attn_3::fwd         1.15%      52.002us        34.10%       1.539ms     512.843us       2.919ms       100.00%       3.923ms       1.308ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.921ms       100.06%       2.921ms       2.921ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.919ms       100.00%       2.919ms     973.037us             3  
+                                Activity Buffer Request        31.44%       1.418ms        31.44%       1.418ms       1.418ms       1.004ms        34.40%       1.004ms       1.004ms             1  
+                                            aten::empty         0.63%      28.392us         0.63%      28.392us       4.732us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.520us         0.12%       5.520us       1.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.76%      34.420us         0.76%      34.420us      11.473us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.21%       9.519us         0.52%      23.650us       3.942us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.31%      14.131us         0.31%      14.131us       2.355us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        55.28%       2.494ms        55.28%       2.494ms       2.494ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.484ms
-Self CUDA time total: 2.888ms
+Self CPU time total: 4.511ms
+Self CUDA time total: 2.919ms
 
 
 
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         6.31%     299.042us        46.56%       2.205ms       2.205ms       0.000us         0.00%       3.936ms       3.936ms             1  
-                             xformers_flash3::flash_fwd         2.97%     140.784us        39.75%       1.883ms     627.609us       0.000us         0.00%       3.936ms       1.312ms             3  
-                                      flash_attn_3::fwd         1.10%      52.191us        36.78%       1.742ms     580.681us       2.941ms       100.00%       3.936ms       1.312ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.943ms       100.05%       2.943ms       2.943ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.941ms       100.00%       2.941ms     980.445us             3  
-                                Activity Buffer Request        30.11%       1.426ms        30.11%       1.426ms       1.426ms     994.973us        33.83%     994.973us     994.973us             1  
-                                            aten::empty         0.64%      30.333us         0.64%      30.333us       5.055us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.11%       5.440us         0.11%       5.440us       1.813us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.81%     227.898us         4.81%     227.898us      75.966us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.19%       8.769us         0.49%      23.220us       3.870us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.31%      14.451us         0.31%      14.451us       2.409us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.44%       2.531ms        53.44%       2.531ms       2.531ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         6.73%     317.798us        47.46%       2.241ms       2.241ms       0.000us         0.00%       3.892ms       3.892ms             1  
+                             xformers_flash3::flash_fwd         3.10%     146.544us        40.23%       1.900ms     633.169us       0.000us         0.00%       3.892ms       1.297ms             3  
+                                      flash_attn_3::fwd         1.15%      54.462us        37.13%       1.753ms     584.321us       2.910ms       100.00%       3.892ms       1.297ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.911ms       100.05%       2.911ms       2.911ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.910ms       100.00%       2.910ms     969.848us             3  
+                                Activity Buffer Request        30.01%       1.417ms        30.01%       1.417ms       1.417ms     982.915us        33.78%     982.915us     982.915us             1  
+                                            aten::empty         0.62%      29.170us         0.62%      29.170us       4.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.370us         0.11%       5.370us       1.790us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.23%     247.156us         5.23%     247.156us      82.385us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.20%       9.560us         0.50%      23.460us       3.910us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.29%      13.900us         0.29%      13.900us       2.317us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        52.54%       2.481ms        52.54%       2.481ms       2.481ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.736ms
-Self CUDA time total: 2.941ms
+Self CPU time total: 4.721ms
+Self CUDA time total: 2.910ms
 
 
 
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         5.82%     299.962us        41.73%       2.152ms       2.152ms       0.000us         0.00%       4.566ms       4.566ms             1  
-                             xformers_flash3::flash_fwd         2.76%     142.114us        35.47%       1.829ms     609.751us       0.000us         0.00%       4.566ms       1.522ms             3  
-                                      flash_attn_3::fwd         1.04%      53.631us        32.71%       1.687ms     562.380us       3.419ms       100.00%       4.566ms       1.522ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.420ms       100.05%       3.420ms       3.420ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.419ms       100.00%       3.419ms       1.140ms             3  
-                                Activity Buffer Request        27.56%       1.422ms        27.56%       1.422ms       1.422ms       1.148ms        33.58%       1.148ms       1.148ms             1  
-                                            aten::empty         0.60%      31.172us         0.60%      31.172us       5.195us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.11%       5.431us         0.11%       5.431us       1.810us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.40%     175.366us         3.40%     175.366us      58.455us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.17%       8.849us         0.45%      23.030us       3.838us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.27%      14.181us         0.27%      14.181us       2.363us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        58.27%       3.005ms        58.27%       3.005ms       3.005ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         5.86%     306.369us        41.94%       2.193ms       2.193ms       0.000us         0.00%       4.614ms       4.614ms             1  
+                             xformers_flash3::flash_fwd         2.85%     149.202us        35.63%       1.863ms     620.885us       0.000us         0.00%       4.614ms       1.538ms             3  
+                                      flash_attn_3::fwd         1.03%      53.951us        32.77%       1.713ms     571.151us       3.461ms       100.00%       4.614ms       1.538ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.462ms       100.04%       3.462ms       3.462ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.461ms       100.00%       3.461ms       1.154ms             3  
+                                Activity Buffer Request        27.28%       1.426ms        27.28%       1.426ms       1.426ms       1.153ms        33.31%       1.153ms       1.153ms             1  
+                                            aten::empty         0.55%      28.813us         0.55%      28.813us       4.802us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.560us         0.11%       5.560us       1.853us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.80%     198.684us         3.80%     198.684us      66.228us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.18%       9.430us         0.46%      23.930us       3.988us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.28%      14.500us         0.28%      14.500us       2.417us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        58.06%       3.036ms        58.06%       3.036ms       3.036ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.157ms
-Self CUDA time total: 3.419ms
+Self CPU time total: 5.228ms
+Self CUDA time total: 3.461ms
 
 
 
@@ -4043,37 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         5.76%     295.800us        41.67%       2.139ms       2.139ms       0.000us         0.00%       4.557ms       4.557ms             1  
-                             xformers_flash3::flash_fwd         2.75%     141.044us        35.47%       1.821ms     606.924us       0.000us         0.00%       4.557ms       1.519ms             3  
-                                      flash_attn_3::fwd         1.04%      53.523us        32.72%       1.680ms     559.910us       3.405ms       100.00%       4.557ms       1.519ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.407ms       100.05%       3.407ms       3.407ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.405ms       100.00%       3.405ms       1.135ms             3  
-                                Activity Buffer Request        27.67%       1.420ms        27.67%       1.420ms       1.420ms       1.152ms        33.82%       1.152ms       1.152ms             1  
-                                            aten::empty         0.60%      30.610us         0.60%      30.610us       5.102us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       6.310us         0.12%       6.310us       2.103us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.29%     168.946us         3.29%     168.946us      56.315us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.17%       8.721us         0.44%      22.392us       3.732us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.27%      13.671us         0.27%      13.671us       2.279us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        58.33%       2.994ms        58.33%       2.994ms       2.994ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         5.96%     310.158us        41.66%       2.167ms       2.167ms       0.000us         0.00%       4.643ms       4.643ms             1  
+                             xformers_flash3::flash_fwd         2.83%     146.954us        35.22%       1.832ms     610.728us       0.000us         0.00%       4.643ms       1.548ms             3  
+                                      flash_attn_3::fwd         1.00%      51.911us        32.40%       1.685ms     561.744us       3.464ms       100.00%       4.643ms       1.548ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.465ms       100.04%       3.465ms       3.465ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.464ms       100.00%       3.464ms       1.155ms             3  
+                                Activity Buffer Request        27.49%       1.430ms        27.49%       1.430ms       1.430ms       1.179ms        34.05%       1.179ms       1.179ms             1  
+                                            aten::empty         0.54%      28.311us         0.54%      28.311us       4.719us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.750us         0.11%       5.750us       1.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.25%     169.084us         3.25%     169.084us      56.361us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.17%       8.670us         0.48%      24.720us       4.120us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.31%      16.050us         0.31%      16.050us       2.675us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        58.34%       3.035ms        58.34%       3.035ms       3.035ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.133ms
-Self CUDA time total: 3.405ms
+Self CPU time total: 5.202ms
+Self CUDA time total: 3.464ms
 
 
 impl                     wl                  p50(ms)  ok
-xformers_meff            cuda_attn_L128_bfloat16     0.98  True
+xformers_meff            cuda_attn_L128_bfloat16     1.00  True
 xformers_meff            cuda_attn_L256_bfloat16     1.03  True
 xformers_meff            cuda_attn_L320_bfloat16     1.08  True
-xformers_meff            cuda_attn_L384_bfloat16     1.10  True
-xformers_meff            cuda_attn_L448_bfloat16     1.23  True
-xformers_meff            cuda_attn_L512_bfloat16     1.22  True
+xformers_meff            cuda_attn_L384_bfloat16     1.09  True
+xformers_meff            cuda_attn_L448_bfloat16     1.25  True
+xformers_meff            cuda_attn_L512_bfloat16     1.24  True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
 Downloading xformers (111.8MiB)
  Downloading xformers
-Installed 1 package in 14ms
+Installed 1 package in 13ms
 </div>
 </div>
 <div class="cell-artifacts">
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg
index 689e04d1be57a1e800f341bc84fe4bfaf1387666..19f0903d77a8fb32c0a3ed03553c82706371801e 100644
--- a/flash_attn/results/artifacts/combine/latency.svg
+++ b/flash_attn/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:168c229932ad06a68508a4a77b66485ff9bcf48ed736a5ffdd003f5cb9e8e639
-size 24777
+oid sha256:0a7d7b3dc8fc6b60a4b9f8bfcf3e229706548b71a8174822b89cc9a2746d3bbd
+size 24787
diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html
index 7a5f09ca394f53e1d971ad7b608a69d09750ab95..3a2204532e0ec8ef3588194f5c38935fb60f8208 100644
--- a/flash_attn/results/combined_results.html
+++ b/flash_attn/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:17.505622</dc:date>
+    <dc:date>2025-10-29T14:28:03.109695</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -3982,96 +3982,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.81 403.521712  L 835.361742 403.521712  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 409.00723  L 835.361742 409.00723  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="403.521712" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="409.00723" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="407.320931" transform="rotate(-0 40.81 407.320931)">1.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="412.806448" transform="rotate(-0 40.81 412.806448)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.81 343.523424  L 835.361742 343.523424  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 347.973099  L 835.361742 347.973099  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="343.523424" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="347.973099" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.322643" transform="rotate(-0 40.81 347.322643)">1.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="351.772318" transform="rotate(-0 40.81 351.772318)">1.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.81 283.525136  L 835.361742 283.525136  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 286.938969  L 835.361742 286.938969  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="283.525136" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="286.938969" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="287.324355" transform="rotate(-0 40.81 287.324355)">1.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="290.738187" transform="rotate(-0 40.81 290.738187)">1.4</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.81 223.526848  L 835.361742 223.526848  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 225.904838  L 835.361742 225.904838  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="223.526848" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="225.904838" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="227.326067" transform="rotate(-0 40.81 227.326067)">1.6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="229.704057" transform="rotate(-0 40.81 229.704057)">1.6</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.81 163.52856  L 835.361742 163.52856  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 164.870708  L 835.361742 164.870708  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="163.52856" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="164.870708" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="167.327779" transform="rotate(-0 40.81 167.327779)">1.8</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="168.669926" transform="rotate(-0 40.81 168.669926)">1.8</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 47.81 103.530273  L 835.361742 103.530273  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 103.836577  L 835.361742 103.836577  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="103.530273" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="103.836577" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.329491" transform="rotate(-0 40.81 107.329491)">2.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.635796" transform="rotate(-0 40.81 107.635796)">2.0</text>
      </g>
     </g>
     <g id="ytick_7">
      <g id="grid-y--8" class="grid grid-y">
-      <path d="M 47.81 43.531985  L 835.361742 43.531985  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 42.802447  L 835.361742 42.802447  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="43.531985" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="42.802447" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="47.331204" transform="rotate(-0 40.81 47.331204)">2.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.601665" transform="rotate(-0 40.81 46.601665)">2.2</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4079,73 +4079,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     </g>
    </g>
    <g id="series--torch-flash-ma" class="series">
-    <path d="M 83.607806 337.456697  L 226.799032 322.330829  L 369.990258 318.592935  L 513.181484 311.965825  L 656.37271 262.663131  L 799.563935 254.692359  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 340.639848  L 226.799032 324.181385  L 369.990258 320.559009  L 513.181484 308.901185  L 656.37271 265.282228  L 799.563935 254.967155  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#md7efaf3aec" x="83.607806" y="337.456697" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="226.799032" y="322.330829" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="369.990258" y="318.592935" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="513.181484" y="311.965825" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="656.37271" y="262.663131" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="799.563935" y="254.692359" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="83.607806" y="340.639848" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="226.799032" y="324.181385" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="369.990258" y="320.559009" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="513.181484" y="308.901185" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="656.37271" y="265.282228" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="799.563935" y="254.967155" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-mem-eff" class="series">
-    <path d="M 83.607806 144.033917  L 226.799032 111.747638  L 369.990258 92.42159  L 513.181484 85.353791  L 656.37271 94.728524  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 156.748591  L 226.799032 137.315018  L 369.990258 105.143013  L 513.181484 114.228248  L 656.37271 86.655469  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#m9b8c54d372" x="83.607806" y="144.033917" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="226.799032" y="111.747638" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="369.990258" y="92.42159" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="513.181484" y="85.353791" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="656.37271" y="94.728524" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.607806" y="156.748591" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="226.799032" y="137.315018" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="369.990258" y="105.143013" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="513.181484" y="114.228248" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="656.37271" y="86.655469" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="series--xformers-meff" class="series">
-    <path d="M 83.607806 408.245077  L 226.799032 395.990127  L 369.990258 378.455027  L 513.181484 373.43287  L 656.37271 333.571508  L 799.563935 337.423698  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 410.498293  L 226.799032 399.197519  L 369.990258 383.346345  L 513.181484 381.042612  L 656.37271 332.003214  L 799.563935 335.418073  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="mc655281e0b" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #2ca02c" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#mc655281e0b" x="83.607806" y="408.245077" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="226.799032" y="395.990127" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="369.990258" y="378.455027" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="513.181484" y="373.43287" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="656.37271" y="333.571508" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="799.563935" y="337.423698" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="83.607806" y="410.498293" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="226.799032" y="399.197519" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="369.990258" y="383.346345" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="513.181484" y="381.042612" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="656.37271" y="332.003214" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="799.563935" y="335.418073" style="fill: #2ca02c; stroke: #2ca02c" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn" class="series">
-    <path d="M 83.607806 415.568468  L 226.799032 400.735991  L 369.990258 386.008812  L 513.181484 387.284075  L 656.37271 338.461368  L 799.563935 341.493982  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 418.603626  L 226.799032 405.380276  L 369.990258 389.547718  L 513.181484 382.629499  L 656.37271 335.525188  L 799.563935 340.270592  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m61c8040d7e" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #d62728" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#m61c8040d7e" x="83.607806" y="415.568468" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="226.799032" y="400.735991" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="369.990258" y="386.008812" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="513.181484" y="387.284075" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="656.37271" y="338.461368" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="799.563935" y="341.493982" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="83.607806" y="418.603626" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="226.799032" y="405.380276" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="369.990258" y="389.547718" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="513.181484" y="382.629499" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="656.37271" y="335.525188" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="799.563935" y="340.270592" style="fill: #d62728; stroke: #d62728" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn3" class="series">
-    <path d="M 83.607806 428.387702  L 226.799032 415.50217  L 369.990258 397.727077  L 513.181484 397.526383  L 656.37271 348.148992  L 799.563935 348.55398  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 428.387702  L 226.799032 418.05737  L 369.990258 396.545281  L 513.181484 392.764216  L 656.37271 347.753681  L 799.563935 353.503096  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m7cd35be9cc" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #9467bd" />
     </defs>
     <g clip-path="url(#p09feef2583)">
      <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="226.799032" y="415.50217" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="369.990258" y="397.727077" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.526383" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.148992" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.55398" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.05737" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.545281" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="513.181484" y="392.764216" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="656.37271" y="347.753681" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="799.563935" y="353.503096" style="fill: #9467bd; stroke: #9467bd" />
     </g>
    </g>
    <g id="patch_3">
@@ -4337,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.96  True
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.97  True
 hf_kernels_flash_attn    cuda_attn_L256_bfloat16     1.01  True
 hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.06  True
-hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.05  True
-hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.22  True
-hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.21  True
-hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.92  True
-hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.96  True
-hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.02  True
-hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.02  True
-hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.18  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.09  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.24  True
+hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.23  True
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.94  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.97  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.04  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.05  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.20  True
 hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.18  True
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_5c963cbdaf16559b&#x27; has no attribute &#x27;fwd&#x27;
 torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
-torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
-torch_flash_ma           cuda_attn_L320_bfloat16     1.28  True
-torch_flash_ma           cuda_attn_L384_bfloat16     1.31  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.28  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.29  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.33  True
 torch_flash_ma           cuda_attn_L448_bfloat16     1.47  True
 torch_flash_ma           cuda_attn_L512_bfloat16     1.50  True
-torch_mem_eff            cuda_attn_L128_bfloat16     1.86  True
-torch_mem_eff            cuda_attn_L256_bfloat16     1.97  True
-torch_mem_eff            cuda_attn_L320_bfloat16     2.04  True
-torch_mem_eff            cuda_attn_L384_bfloat16     2.06  True
-torch_mem_eff            cuda_attn_L448_bfloat16     2.03  True
+torch_mem_eff            cuda_attn_L128_bfloat16     1.83  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.89  True
+torch_mem_eff            cuda_attn_L320_bfloat16     2.00  True
+torch_mem_eff            cuda_attn_L384_bfloat16     1.97  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.06  True
 torch_mem_eff            cuda_attn_L512_bfloat16     2.19  True
-xformers_meff            cuda_attn_L128_bfloat16     0.98  True
+xformers_meff            cuda_attn_L128_bfloat16     1.00  True
 xformers_meff            cuda_attn_L256_bfloat16     1.03  True
 xformers_meff            cuda_attn_L320_bfloat16     1.08  True
-xformers_meff            cuda_attn_L384_bfloat16     1.10  True
-xformers_meff            cuda_attn_L448_bfloat16     1.23  True
-xformers_meff            cuda_attn_L512_bfloat16     1.22  True
+xformers_meff            cuda_attn_L384_bfloat16     1.09  True
+xformers_meff            cuda_attn_L448_bfloat16     1.25  True
+xformers_meff            cuda_attn_L512_bfloat16     1.24  True
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4402,7 +4402,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 187ms
+Installed 37 packages in 208ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4415,7 +4415,7 @@ Installed 37 packages in 187ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:17.505622</dc:date>
+    <dc:date>2025-10-29T14:28:03.109695</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4525,96 +4525,96 @@ Installed 37 packages in 187ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.81 403.521712  L 835.361742 403.521712  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 409.00723  L 835.361742 409.00723  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="403.521712" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="409.00723" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="407.320931" transform="rotate(-0 40.81 407.320931)">1.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="412.806448" transform="rotate(-0 40.81 412.806448)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.81 343.523424  L 835.361742 343.523424  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 347.973099  L 835.361742 347.973099  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="343.523424" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="347.973099" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.322643" transform="rotate(-0 40.81 347.322643)">1.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="351.772318" transform="rotate(-0 40.81 351.772318)">1.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.81 283.525136  L 835.361742 283.525136  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 286.938969  L 835.361742 286.938969  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="283.525136" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="286.938969" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="287.324355" transform="rotate(-0 40.81 287.324355)">1.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="290.738187" transform="rotate(-0 40.81 290.738187)">1.4</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.81 223.526848  L 835.361742 223.526848  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 225.904838  L 835.361742 225.904838  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="223.526848" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="225.904838" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="227.326067" transform="rotate(-0 40.81 227.326067)">1.6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="229.704057" transform="rotate(-0 40.81 229.704057)">1.6</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.81 163.52856  L 835.361742 163.52856  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 164.870708  L 835.361742 164.870708  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="163.52856" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="164.870708" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="167.327779" transform="rotate(-0 40.81 167.327779)">1.8</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="168.669926" transform="rotate(-0 40.81 168.669926)">1.8</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 47.81 103.530273  L 835.361742 103.530273  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 103.836577  L 835.361742 103.836577  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="103.530273" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="103.836577" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.329491" transform="rotate(-0 40.81 107.329491)">2.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.635796" transform="rotate(-0 40.81 107.635796)">2.0</text>
      </g>
     </g>
     <g id="ytick_7">
      <g id="grid-y--8" class="grid grid-y">
-      <path d="M 47.81 43.531985  L 835.361742 43.531985  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 42.802447  L 835.361742 42.802447  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="43.531985" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="42.802447" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="47.331204" transform="rotate(-0 40.81 47.331204)">2.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="46.601665" transform="rotate(-0 40.81 46.601665)">2.2</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4622,73 +4622,73 @@ Installed 37 packages in 187ms
     </g>
    </g>
    <g id="series--torch-flash-ma" class="series">
-    <path d="M 83.607806 337.456697  L 226.799032 322.330829  L 369.990258 318.592935  L 513.181484 311.965825  L 656.37271 262.663131  L 799.563935 254.692359  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 340.639848  L 226.799032 324.181385  L 369.990258 320.559009  L 513.181484 308.901185  L 656.37271 265.282228  L 799.563935 254.967155  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#md7efaf3aec" x="83.607806" y="337.456697" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="226.799032" y="322.330829" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="369.990258" y="318.592935" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="513.181484" y="311.965825" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="656.37271" y="262.663131" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="799.563935" y="254.692359" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="83.607806" y="340.639848" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="226.799032" y="324.181385" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="369.990258" y="320.559009" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="513.181484" y="308.901185" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="656.37271" y="265.282228" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="799.563935" y="254.967155" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-mem-eff" class="series">
-    <path d="M 83.607806 144.033917  L 226.799032 111.747638  L 369.990258 92.42159  L 513.181484 85.353791  L 656.37271 94.728524  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 156.748591  L 226.799032 137.315018  L 369.990258 105.143013  L 513.181484 114.228248  L 656.37271 86.655469  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#m9b8c54d372" x="83.607806" y="144.033917" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="226.799032" y="111.747638" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="369.990258" y="92.42159" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="513.181484" y="85.353791" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="656.37271" y="94.728524" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.607806" y="156.748591" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="226.799032" y="137.315018" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="369.990258" y="105.143013" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="513.181484" y="114.228248" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="656.37271" y="86.655469" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="series--xformers-meff" class="series">
-    <path d="M 83.607806 408.245077  L 226.799032 395.990127  L 369.990258 378.455027  L 513.181484 373.43287  L 656.37271 333.571508  L 799.563935 337.423698  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 410.498293  L 226.799032 399.197519  L 369.990258 383.346345  L 513.181484 381.042612  L 656.37271 332.003214  L 799.563935 335.418073  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="mc655281e0b" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #2ca02c" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#mc655281e0b" x="83.607806" y="408.245077" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="226.799032" y="395.990127" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="369.990258" y="378.455027" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="513.181484" y="373.43287" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="656.37271" y="333.571508" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="799.563935" y="337.423698" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="83.607806" y="410.498293" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="226.799032" y="399.197519" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="369.990258" y="383.346345" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="513.181484" y="381.042612" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="656.37271" y="332.003214" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="799.563935" y="335.418073" style="fill: #2ca02c; stroke: #2ca02c" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn" class="series">
-    <path d="M 83.607806 415.568468  L 226.799032 400.735991  L 369.990258 386.008812  L 513.181484 387.284075  L 656.37271 338.461368  L 799.563935 341.493982  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 418.603626  L 226.799032 405.380276  L 369.990258 389.547718  L 513.181484 382.629499  L 656.37271 335.525188  L 799.563935 340.270592  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m61c8040d7e" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #d62728" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#m61c8040d7e" x="83.607806" y="415.568468" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="226.799032" y="400.735991" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="369.990258" y="386.008812" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="513.181484" y="387.284075" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="656.37271" y="338.461368" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="799.563935" y="341.493982" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="83.607806" y="418.603626" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="226.799032" y="405.380276" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="369.990258" y="389.547718" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="513.181484" y="382.629499" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="656.37271" y="335.525188" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="799.563935" y="340.270592" style="fill: #d62728; stroke: #d62728" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn3" class="series">
-    <path d="M 83.607806 428.387702  L 226.799032 415.50217  L 369.990258 397.727077  L 513.181484 397.526383  L 656.37271 348.148992  L 799.563935 348.55398  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 428.387702  L 226.799032 418.05737  L 369.990258 396.545281  L 513.181484 392.764216  L 656.37271 347.753681  L 799.563935 353.503096  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m7cd35be9cc" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #9467bd" />
     </defs>
     <g clip-path="url(#p09feef2583)">
      <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="226.799032" y="415.50217" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="369.990258" y="397.727077" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.526383" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.148992" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.55398" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.05737" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.545281" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="513.181484" y="392.764216" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="656.37271" y="347.753681" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="799.563935" y="353.503096" style="fill: #9467bd; stroke: #9467bd" />
     </g>
    </g>
    <g id="patch_3">
diff --git a/index.html b/index.html
index 33ea1b019a71f451c81dbc10c5e67f8c6ca9b465..1061b4b3222caa3480fdd412bcf6f18bb97b54f9 100644
--- a/index.html
+++ b/index.html
@@ -1,89 +1,4029 @@
 <!DOCTYPE html>
-<html>
+<html lang="en">
 <head>
-  <meta charset='UTF-8'>
-  <meta name='viewport' content='width=device-width, initial-scale=1.0'>
-  <title>Index of /</title>
-  <style>
-    :root {
-      --bg-primary: #0a0a0a;
-      --bg-secondary: #121212;
-      --bg-tertiary: #181818;
-      --text-primary: #e0e0e0;
-      --text-secondary: #888888;
-      --text-link: #64b5f6;
-      --border-primary: #2a2a2a;
-    }
-    body {
-      font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
-      background: var(--bg-primary);
-      color: var(--text-primary);
-      margin: 0;
-      padding: 16px;
-      max-width: 900px;
-      margin: 0 auto;
-    }
-    .controls {
-      display: flex;
-      justify-content: flex-end;
-      margin-bottom: 1rem;
-    }
-    .back-button {
-      background: var(--bg-secondary);
-      border: 1px solid var(--border-primary);
-      padding: 8px 12px;
-      border-radius: 4px;
-      color: var(--text-secondary);
-      cursor: pointer;
-      font-size: 0.9rem;
-      text-decoration: none;
-      display: inline-block;
-    }
-    .back-button:hover {
-      color: var(--text-primary);
-      background: var(--bg-tertiary);
-    }
-    h1 {
-      font-size: 1.5em;
-      margin: 1rem 0;
-      color: var(--text-primary);
-      border-bottom: 1px solid var(--border-primary);
-      padding-bottom: 0.5rem;
-    }
-    ul {
-      list-style-type: none;
-      padding: 0;
-    }
-    li {
-      margin: 0;
-      border-bottom: 1px solid var(--border-primary);
-    }
-    li:last-child {
-      border-bottom: none;
-    }
-    a {
-      display: block;
-      padding: 0.75rem 0.5rem;
-      text-decoration: none;
-      color: var(--text-link);
-      transition: background 0.2s ease;
-    }
-    a:hover {
-      background: var(--bg-secondary);
-    }
-    .dir {
-      font-weight: 500;
-    }
-  </style>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>index</title>
+    <script>
+// Iframe-friendly navigation router
+        (function() {
+            const isIframe = window.self !== window.top;
+            if (!isIframe) return; // Only activate in iframe context
+
+            // On load: if hash points to a different page, navigate there
+            const hash = window.location.hash;
+            if (hash && hash.startsWith('#/')) {
+                const targetPath = hash.slice(2); // Remove '#/'
+                const currentPath = window.location.pathname.split('/').pop();
+
+                // Only navigate if we're not already on the target page
+                if (targetPath !== currentPath) {
+                    window.location.href = targetPath;
+                    return; // Stop execution, we're navigating away
+                }
+            }
+
+            // Intercept all link clicks for hash-based navigation
+            document.addEventListener('click', function(e) {
+                const link = e.target.closest('a');
+                if (!link) return;
+
+                const href = link.getAttribute('href');
+
+                // Skip external links, anchors, and javascript: links
+                if (!href || href.startsWith('#') || href.startsWith('http') || href.startsWith('javascript:')) {
+                    return;
+                }
+
+                e.preventDefault();
+
+                // Convert relative/absolute path to hash-based navigation
+                const url = new URL(href, window.location.href);
+                let fullPath = url.pathname;
+
+                // Remove leading slash if present for cleaner paths
+                if (fullPath.startsWith('/')) {
+                    fullPath = fullPath.slice(1);
+                }
+
+                // Update parent URL hash
+                window.location.hash = '#/' + fullPath;
+
+                // For HTML files, navigate within iframe
+                if (fullPath.endsWith('.html') || fullPath.endsWith('/')) {
+                    const pathParts = fullPath.split('/').filter(p => p);
+                    const targetFile = pathParts[pathParts.length - 1] || 'index.html';
+                    window.location.href = targetFile;
+                } else {
+                    // For non-HTML files (raw .py, etc), open directly
+                    window.open(href, '_blank');
+                }
+            });
+        })();
+
+        // Apply theme and widget visibility immediately to prevent flicker
+        (function() {
+            const configTheme = 'dark';
+            const hasConfigUi = false;
+            const configUi = hasConfigUi ? 'None' : null;
+            const hasWidgetsConfig = false;
+            const widgetsOn = hasWidgetsConfig ? false : true;
+            let theme;
+            if (configTheme === 'auto') {
+                theme = window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
+            } else {
+                theme = localStorage.getItem('uvnote-theme') || configTheme;
+            }
+            document.documentElement.setAttribute('data-theme', theme);
+
+            // Initialize UI theme (css theme)
+            let ui = hasConfigUi ? configUi : (localStorage.getItem('uvnote-ui') || 'default');
+            if (ui !== 'default' && ui !== 'none' && ui !== 'monocolor') { ui = 'default'; }
+            document.documentElement.setAttribute('data-ui', ui);
+
+            // Apply widgets visibility
+            document.documentElement.setAttribute('data-widgets', widgetsOn ? 'on' : 'off');
+        })();
+    </script>
+    <style>
+:root[data-theme="light"] {
+            --bg-primary: #ffffff;
+            --bg-secondary: #f6f8fa;
+            --bg-tertiary: #f8f9fa;
+            --bg-code: #f8f9fa;
+            --bg-error: #fdf2f2;
+            --bg-artifact: #e6f3ff;
+            --bg-artifact-hover: #d0e7ff;
+            
+            --text-primary: #333;
+            --text-secondary: #656d76;
+            --text-error: #c53030;
+            --text-link: #0969da;
+            
+            --border-primary: #e1e5e9;
+            --border-error: #e53e3e;
+            --border-cell-failed: #d73a49;
+            
+            --shadow: rgba(0, 0, 0, 0.1);
+        }
+
+        :root[data-theme="dark"] {
+            --bg-primary: #0a0a0a;
+            --bg-secondary: #121212;
+            --bg-tertiary: #181818;
+            --bg-code: #0d0d0d;
+            --bg-error: #1a0f0f;
+            --bg-artifact: #151515;
+            --bg-artifact-hover: #1a1a1a;
+            
+            --text-primary: #e0e0e0;
+            --text-secondary: #888888;
+            --text-error: #ff6b6b;
+            --text-link: #64b5f6;
+            
+            --border-primary: #2a2a2a;
+            --border-error: #ff6b6b;
+            --border-cell-failed: #ff6b6b;
+            
+            --shadow: rgba(255, 255, 255, 0.05);
+        }
+        /* Monocolor UI theme: black/white background, all text/borders single blue */
+        :root[data-ui="monocolor"] { --mono-color: #0a66ff; }
+        :root[data-ui="monocolor"][data-theme="light"] {
+            --bg-primary: #ffffff;
+        }
+        :root[data-ui="monocolor"][data-theme="dark"] {
+            --bg-primary: #000000;
+        }
+        :root[data-ui="monocolor"] {
+            --bg-secondary: var(--bg-primary);
+            --bg-tertiary: var(--bg-primary);
+            --bg-code: var(--bg-primary);
+            --bg-error: var(--bg-primary);
+            --bg-artifact: var(--bg-primary);
+            --bg-artifact-hover: var(--bg-primary);
+
+            --text-primary: var(--mono-color);
+            --text-secondary: var(--mono-color);
+            --text-error: var(--mono-color);
+            --text-link: var(--mono-color);
+
+            --border-primary: var(--mono-color);
+            --border-error: var(--mono-color);
+            --border-cell-failed: var(--mono-color);
+
+            --shadow: none;
+        }
+        /* Monocolor UI: surfaces use --bg-primary; text, links and borders use --mono-color. */
+        :root[data-ui="monocolor"] a {
+            color: var(--mono-color);
+        }
+        /* Top-bar controls: resting and hover states share one declaration set. */
+        :root[data-ui="monocolor"] .menu-button,
+        :root[data-ui="monocolor"] .menu-button:hover,
+        :root[data-ui="monocolor"] .theme-toggle,
+        :root[data-ui="monocolor"] .theme-toggle:hover,
+        :root[data-ui="monocolor"] .reset-toggle,
+        :root[data-ui="monocolor"] .reset-toggle:hover,
+        :root[data-ui="monocolor"] .back-button,
+        :root[data-ui="monocolor"] .back-button:hover {
+            background: var(--bg-primary);
+            border-color: var(--mono-color);
+            color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .menu-dropdown {
+            background: var(--bg-primary);
+            border-color: var(--mono-color);
+            box-shadow: none;
+        }
+        :root[data-ui="monocolor"] .menu-item {
+            color: var(--mono-color);
+            border-bottom-color: var(--mono-color);
+        }
+        /* Page-level containers */
+        :root[data-ui="monocolor"] .system-info,
+        :root[data-ui="monocolor"] .cell {
+            background: var(--bg-primary);
+            border-color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .cell-header,
+        :root[data-ui="monocolor"] .cell-code {
+            background: var(--bg-primary);
+            border-bottom-color: var(--mono-color);
+        }
+        /* Artifact chips, floating widgets and tool buttons get the same treatment */
+        :root[data-ui="monocolor"] .artifact,
+        :root[data-ui="monocolor"] .status-widget,
+        :root[data-ui="monocolor"] .minimap,
+        :root[data-ui="monocolor"] .file-explorer,
+        :root[data-ui="monocolor"] .tools-widget,
+        :root[data-ui="monocolor"] .tool-button {
+            background: var(--bg-primary);
+            border-color: var(--mono-color);
+            color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .artifact:hover {
+            background: var(--bg-primary);
+        }
+        :root[data-ui="monocolor"] .artifact-preview img,
+        :root[data-ui="monocolor"] .artifact-preview svg,
+        :root[data-ui="monocolor"] .tool-button.active {
+            border-color: var(--mono-color);
+        }
+        /* Widget headings and list entries */
+        :root[data-ui="monocolor"] .tools-title,
+        :root[data-ui="monocolor"] .file-explorer-section-title,
+        :root[data-ui="monocolor"] .minimap-title {
+            color: var(--mono-color);
+            border-bottom-color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .file-explorer-item,
+        :root[data-ui="monocolor"] .minimap-item {
+            color: var(--mono-color);
+        }
+        /* Flatten Pygments output: !important is needed to beat the per-token colour rules */
+        :root[data-ui="monocolor"] .highlight {
+            background: var(--bg-primary) !important;
+            color: var(--mono-color) !important;
+        }
+        :root[data-ui="monocolor"] .highlight *,
+        :root[data-ui="monocolor"] .highlight .hll {
+            color: var(--mono-color) !important;
+            background: transparent !important;
+            border-color: var(--mono-color) !important;
+        }
+        /* Code typography knobs: defaults live here and may be overridden via frontmatter */
+        :root {
+            --code-font-size: 0.95rem;
+            --code-line-height: 1.5;
+            --code-pad-y: 0.75rem;
+        }
+        /* "none" UI: redefine the base palette for a flat, 90s-style page */
+        :root[data-ui="none"] {
+            /* surfaces */
+            --bg-primary: #ffffff;
+            --bg-secondary: transparent;
+            --bg-tertiary: transparent;
+            --bg-code: #f9f9f9;
+            --bg-error: #fff0f0;
+            --bg-artifact: #f0f7ff;
+            --bg-artifact-hover: #e5f1ff;
+
+            /* text */
+            --text-primary: #000000;
+            --text-secondary: #222222;
+            --text-error: #a00000;
+            --text-link: #0000ee;
+
+            /* borders */
+            --border-primary: #cccccc;
+            --border-error: #cc0000;
+            --border-cell-failed: #cc0000;
+
+            /* no drop shadows in this theme */
+            --shadow: none;
+        }
+        /* Turn off overscroll effects on the root element */
+        html {
+            overscroll-behavior: none;
+        }
+        /* Page shell: centred monospace column driven by the theme variables */
+        body {
+            max-width: 1000px;
+            margin: 0 auto;
+            padding: 15px;
+            font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+            line-height: 1.4;
+            color: var(--text-primary);
+            background: var(--bg-primary);
+            overscroll-behavior: none;
+            transition: background-color 0.2s ease, color 0.2s ease;
+        }
+        /* "none" UI: narrower serif page with fixed black-on-white colours and no transition */
+        :root[data-ui="none"] body {
+            max-width: 860px;
+            padding: 12px;
+            font-family: 'Times New Roman', Times, serif;
+            line-height: 1.5;
+            color: #000000;
+            background: #ffffff;
+            transition: none;
+        }
+        
+        /* Two panel layout removed */
+        
+        /* Control stack pinned to the top-right corner, right-aligned column */
+        .controls {
+            position: fixed;
+            top: 20px;
+            right: 20px;
+            z-index: 1000;
+            display: flex;
+            flex-direction: column;
+            align-items: flex-end;
+            gap: 0.25rem;
+        }
+        .controls-buttons {
+            display: flex;
+            gap: 0.5rem;
+        }
+        
+        /* Menu trigger; position: relative anchors the absolutely positioned dropdown */
+        .menu-button {
+            position: relative;
+            padding: 8px 12px;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-secondary);
+            color: var(--text-secondary);
+            font-family: inherit;
+            font-size: 0.9rem;
+            cursor: pointer;
+            user-select: none;
+        }
+        .menu-button:hover {
+            color: var(--text-primary);
+            background: var(--bg-tertiary);
+        }
+        /* Minimal UI with widgets enabled: controls keep a visible light-grey chrome */
+        :root[data-ui="none"][data-widgets="on"] .menu-button,
+        :root[data-ui="none"][data-widgets="on"] .theme-toggle,
+        :root[data-ui="none"][data-widgets="on"] .reset-toggle,
+        :root[data-ui="none"][data-widgets="on"] .back-button {
+            background: #f6f6f6;
+            border: 1px solid #cccccc;
+            color: #222222;
+        }
+        /* Status widget: small panel pinned to the bottom-right corner */
+        .status-widget {
+            position: fixed;
+            right: 20px;
+            bottom: 20px;
+            z-index: 100;
+            width: auto;
+            max-width: 260px;
+            padding: 6px 8px;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-secondary);
+            color: var(--text-secondary);
+            font-size: 0.8rem;
+        }
+        .status-widget strong {
+            color: var(--text-primary);
+        }
+        /* Minimal UI with widgets enabled: fixed light-grey chrome for the status panel */
+        :root[data-ui="none"][data-widgets="on"] .status-widget {
+            background: #f6f6f6;
+            border-color: #ccc;
+            color: #222;
+        }
+        /* Minimal UI with widgets enabled: hover state for the top-bar controls */
+        :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
+        :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
+        :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
+        :root[data-ui="none"][data-widgets="on"] .back-button:hover {
+            background: #ededed;
+            border-color: #bbbbbb;
+            color: #000000;
+        }
+        
+        /* Dropdown panel: hidden by default, positioned under the right edge of its button */
+        .menu-dropdown {
+            position: absolute;
+            top: 100%;
+            right: 0;
+            z-index: 1001;
+            min-width: 160px;
+            margin-top: 4px;
+            border: 1px solid var(--border-primary);
+            border-radius: 4px;
+            background: var(--bg-secondary);
+            box-shadow: 0 4px 12px var(--shadow);
+            /* closed state: transparent, non-interactive and nudged upwards */
+            opacity: 0;
+            visibility: hidden;
+            transform: translateY(-8px);
+            transition: all 0.2s ease;
+        }
+        /* Open state: revealed while the parent button carries .active */
+        .menu-button.active .menu-dropdown {
+            opacity: 1;
+            visibility: visible;
+            transform: translateY(0);
+        }
+        /* Minimal UI with widgets enabled: plain white panel without a shadow */
+        :root[data-ui="none"][data-widgets="on"] .menu-dropdown {
+            background: #ffffff;
+            border: 1px solid #cccccc;
+            box-shadow: none;
+        }
+        
+        /* One row of the dropdown menu; rows are separated by a bottom border */
+        .menu-item {
+            display: block;
+            padding: 8px 12px;
+            color: var(--text-secondary);
+            text-decoration: none;
+            font-size: 0.85rem;
+            border-bottom: 1px solid var(--border-primary);
+            cursor: pointer;
+        }
+        :root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; }
+
+        /* The final row has no separator under it */
+        .menu-item:last-child {
+            border-bottom: none;
+        }
+        /* The none-UI rule above has higher specificity than .menu-item:last-child and
+           would put the separator back on the final row; restate the reset here. */
+        :root[data-ui="none"] .menu-item:last-child {
+            border-bottom: none;
+        }
+
+        .menu-item:hover {
+            background: var(--bg-tertiary);
+            color: var(--text-primary);
+        }
+
+        /* Fixed-width slot for the check mark so labels line up */
+        .menu-checkbox {
+            display: inline-block;
+            width: 16px;
+            font-family: monospace;
+            color: var(--text-link);
+        }
+        
+        /* Top-bar buttons: chrome shared by the theme/reset toggles and the back link */
+        .theme-toggle,
+        .reset-toggle,
+        .back-button {
+            padding: 8px 12px;
+            border: 1px solid var(--border-primary);
+            border-radius: 4px;
+            background: var(--bg-secondary);
+            color: var(--text-secondary);
+            font-family: inherit;
+            font-size: 0.9rem;
+            cursor: pointer;
+            user-select: none;
+        }
+        /* The back link is an anchor: make it box-like and drop the underline */
+        .back-button {
+            display: inline-block;
+            text-decoration: none;
+        }
+        /* Toggles only: squarer corners, smaller lowercase label, animated state changes */
+        .theme-toggle,
+        .reset-toggle {
+            border-radius: 2px;
+            font-size: 0.8rem;
+            text-transform: lowercase;
+            letter-spacing: 0;
+            transition: all 0.2s ease;
+        }
+
+        .theme-toggle:hover,
+        .reset-toggle:hover,
+        .back-button:hover {
+            color: var(--text-primary);
+            background: var(--bg-tertiary);
+        }
+        /* Toggles additionally darken their border on hover */
+        .theme-toggle:hover,
+        .reset-toggle:hover {
+            border-color: var(--text-secondary);
+        }
+
+        /* System information panel shown in the page flow */
+        .system-info {
+            margin-bottom: 16px;
+            padding: 8px 12px;
+            border: 1px solid var(--border-primary);
+            border-radius: 4px;
+            background: var(--bg-secondary);
+            color: var(--text-secondary);
+            font-size: 0.85em;
+        }
+        .system-info-header {
+            margin-bottom: 2px;
+            font-weight: 600;
+            color: var(--text-primary);
+        }
+        .system-info-content {
+            font-family: monospace;
+        }
+        
+        /* Minimap: scrollable, slightly translucent panel in the bottom-right corner */
+        .minimap {
+            position: fixed;
+            right: 20px;
+            bottom: 20px;
+            z-index: 100;
+            width: 220px;
+            max-height: 400px;
+            overflow-y: auto;
+            padding: 0.5rem;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-secondary);
+            font-size: 0.7rem;
+            opacity: 0.9;
+            transition: opacity 0.2s ease;
+        }
+        /* data-widgets="off" (set via frontmatter) removes every control and widget */
+        :root[data-widgets="off"] .controls,
+        :root[data-widgets="off"] .minimap,
+        :root[data-widgets="off"] .file-explorer,
+        :root[data-widgets="off"] .tools-widget,
+        :root[data-widgets="off"] .status-widget {
+            display: none !important;
+        }
+        
+        /* File explorer: same footprint as the minimap; JS restacks it from the default bottom offset */
+        .file-explorer {
+            position: fixed;
+            top: auto;
+            right: 20px;
+            bottom: 20px; /* default; JS will stack */
+            left: auto;
+            z-index: 100;
+            width: 220px;
+            max-height: 400px;
+            overflow-y: auto;
+            padding: 0.5rem;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-secondary);
+            font-size: 0.7rem;
+            opacity: 0.9;
+            transition: opacity 0.2s ease;
+        }
+
+        /* Drawing overlay: full-viewport layer stacked below widgets (100) and controls (1000) */
+        .draw-overlay {
+            display: block;
+            position: fixed;
+            top: 0;
+            left: 0;
+            z-index: 80;
+            width: 100vw;
+            height: 100vh;
+            /* clicks pass through by default; enabled only when a tool is active */
+            pointer-events: none;
+        }
+
+        /* Tools widget: floating panel; JS restacks it from the default bottom offset */
+        .tools-widget {
+            position: fixed;
+            top: auto;
+            right: 20px;
+            bottom: 20px; /* default; JS will stack */
+            left: auto;
+            z-index: 100;
+            width: 220px;
+            padding: 0.5rem;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-secondary);
+            font-size: 0.7rem;
+            opacity: 0.95;
+        }
+        /* Panel heading; the grab cursor suggests it is a drag handle */
+        .tools-title {
+            margin-bottom: 0.5rem;
+            padding-bottom: 0.25rem;
+            border-bottom: 1px solid var(--border-primary);
+            font-weight: bold;
+            color: var(--text-secondary);
+            cursor: grab;
+            user-select: none;
+        }
+        .tools-row {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 0.4rem;
+        }
+        .tool-button {
+            padding: 0.25rem 0.4rem;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-tertiary);
+            color: var(--text-secondary);
+            font-family: inherit;
+            font-size: 0.75rem;
+            cursor: pointer;
+            user-select: none;
+        }
+        .tool-button:hover {
+            color: var(--text-primary);
+        }
+        /* Selected tool: stronger text and border on the secondary surface */
+        .tool-button.active {
+            color: var(--text-primary);
+            border-color: var(--text-secondary);
+            background: var(--bg-secondary);
+        }
+        
+        /* Floating panels become fully opaque under the pointer */
+        .minimap:hover,
+        .file-explorer:hover {
+            opacity: 1;
+        }
+
+        /* Minimap heading; doubles as the drag handle */
+        .minimap-title {
+            margin-bottom: 0.5rem;
+            padding-bottom: 0.25rem;
+            border-bottom: 1px solid var(--border-primary);
+            font-weight: bold;
+            color: var(--text-secondary);
+            cursor: grab;
+            user-select: none;
+        }
+
+        /* Minimap entry: a transparent left border reserves room for the hover/active marker */
+        .minimap-item {
+            display: block;
+            padding: 0.15rem 0;
+            padding-left: 0.5rem; /* must follow the padding shorthand */
+            border-left: 2px solid transparent;
+            color: var(--text-secondary);
+            text-decoration: none;
+            cursor: pointer;
+            transition: all 0.2s ease;
+        }
+        .minimap-item:hover {
+            color: var(--text-primary);
+            border-left-color: var(--text-secondary);
+        }
+        .minimap-item.active {
+            color: var(--text-primary);
+            border-left-color: var(--text-link);
+        }
+
+        /* Heading entries: indentation grows by 0.5rem per heading level */
+        .minimap-heading {
+            font-weight: normal;
+        }
+        .minimap-heading.h1 {
+            padding-left: 0.5rem;
+        }
+        .minimap-heading.h2 {
+            padding-left: 1rem;
+        }
+        .minimap-heading.h3 {
+            padding-left: 1.5rem;
+        }
+        .minimap-heading.h4 {
+            padding-left: 2rem;
+        }
+        .minimap-heading.h5 {
+            padding-left: 2.5rem;
+        }
+        .minimap-heading.h6 {
+            padding-left: 3rem;
+        }
+
+        /* Cell entries: dimmed italic link colour, full opacity on hover */
+        .minimap-cell {
+            color: var(--text-link);
+            font-style: italic;
+            opacity: 0.8;
+        }
+        .minimap-cell:hover {
+            opacity: 1;
+        }
+        
+        /* File explorer heading; doubles as the drag handle */
+        .file-explorer-title {
+            margin-bottom: 0.5rem;
+            padding-bottom: 0.25rem;
+            border-bottom: 1px solid var(--border-primary);
+            font-weight: bold;
+            color: var(--text-secondary);
+            cursor: grab;
+            user-select: none;
+        }
+
+        .file-explorer-section {
+            margin-bottom: 0.75rem;
+        }
+
+        /* Small uppercase label above each group of entries */
+        .file-explorer-section-title {
+            margin-bottom: 0.25rem;
+            font-size: 0.65rem;
+            font-weight: bold;
+            letter-spacing: 0.5px;
+            text-transform: uppercase;
+            color: var(--text-secondary);
+        }
+
+        /* One entry in the explorer list */
+        .file-explorer-item {
+            display: block;
+            margin-left: 0.5rem;
+            padding: 0.1rem 0;
+            font-family: monospace;
+            color: var(--text-secondary);
+            text-decoration: none;
+            cursor: pointer;
+            transition: color 0.2s ease;
+        }
+        .file-explorer-item:hover {
+            color: var(--text-primary);
+        }
+        /* Entry variants: scripts use the link colour, artifacts are dimmed */
+        .file-explorer-item.script {
+            color: var(--text-link);
+        }
+        .file-explorer-item.artifact {
+            color: var(--text-secondary);
+            opacity: 0.8;
+        }
+        
+
+        /* Viewports up to 768px wide: drop the floating panels */
+        @media (max-width: 768px) {
+            .minimap,
+            .file-explorer,
+            .tools-widget {
+                display: none;
+            }
+        }
+        
+        /* Notebook cell container; overflow: hidden clips children to the rounded border */
+        .cell {
+            margin: 1rem 0;
+            overflow: hidden;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-secondary);
+        }
+        /* "none" UI: cells are plain, borderless blocks */
+        :root[data-ui="none"] .cell {
+            margin: 1em 0;
+            border: none;
+            background: transparent;
+        }
+        /* Cell title bar. Pointer cursor and hover background mark it as interactive.
+           All declarations must stay inside this one rule: an earlier revision closed it
+           early and left color/cursor/user-select/transition stranded below the none-UI
+           overrides, which made the parser discard them and the :hover rule that followed. */
+        .cell-header {
+            background: var(--bg-secondary);
+            padding: 0.5rem 1rem;
+            border-bottom: 1px solid var(--border-primary);
+            font-family: inherit;
+            font-size: 0.85rem;
+            color: var(--text-secondary);
+            cursor: pointer;
+            user-select: none;
+            transition: background-color 0.2s ease;
+        }
+        .cell-header:hover {
+            background: var(--bg-tertiary);
+        }
+        /* "none" UI: strip the cell chrome and render code/output as plain blocks */
+        :root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; }
+        :root[data-ui="none"] .cell-content { padding: 0; }
+        :root[data-ui="none"] .copy-button,
+        :root[data-ui="none"] .collapse-indicators,
+        :root[data-ui="none"] .cell-meta,
+        :root[data-ui="none"] .cell-outputs-header { display: none !important; }
+        :root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; }
+        :root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; }
+        :root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; }
+        /* Collapse markers in the cell header: muted until hovered */
+        .collapse-indicators {
+            font-size: 0.8rem;
+            color: var(--text-secondary);
+            opacity: 0.7;
+        }
+        .collapse-indicators span:hover {
+            color: var(--text-primary);
+            opacity: 1;
+        }
+        /* Source pane of a cell; .collapsed hides it */
+        .cell-code {
+            display: block;
+            background: var(--bg-code);
+        }
+        .cell-code.collapsed {
+            display: none;
+        }
+        .cell-code pre {
+            margin: 0;
+            padding: 0.75rem;
+            overflow-x: auto;
+            background: var(--bg-code);
+            color: var(--text-primary);
+        }
+        /* Output pane of a cell; .collapsed hides it */
+        .cell-output {
+            padding: 0.75rem;
+            background: var(--bg-secondary);
+        }
+        .cell-output.collapsed {
+            display: none;
+        }
+        /* Captured stdout: a box that scrolls instead of stretching the layout */
+        .cell-stdout {
+            max-width: 100%;         /* never wider than the space the layout provides */
+            overflow: auto;          /* scrollbars appear only when needed */
+            padding: 0.75rem;
+            border-radius: 1px;
+            background: var(--bg-tertiary);
+            color: var(--text-primary);
+            font-family: inherit;
+            font-size: 0.9rem;
+            white-space: pre-wrap;
+        }
+
+        /* Inner <pre>: keeps line breaks, never wraps, and is as wide as its longest line */
+        .cell-stdout .stdout-text {
+            display: inline-block;
+            min-width: max-content;
+            margin: 0;
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            white-space: pre;
+            tab-size: 2;
+        }
+
+        /* Captured stderr: error-tinted block with an accent bar on the left */
+        .cell-stderr {
+            margin: 0.5rem 0;
+            padding: 1rem;
+            border-left: 2px solid var(--border-error);
+            background: var(--bg-error);
+            color: var(--text-error);
+            font-family: inherit;
+            font-size: 0.9rem;
+            white-space: pre-wrap;
+        }
+        /* Collapsible uv install log: a clickable header plus a preformatted body.
+           --border-color is not among the theme variables declared alongside
+           --border-primary, so fall back to --border-primary; without a fallback an
+           undefined variable invalidates the whole border-left declaration. */
+        .uv-install-logs {
+            margin: 0.5rem 0;
+        }
+        .uv-logs-header {
+            cursor: pointer;
+            padding: 0.75rem;
+            border-left: 3px solid var(--border-color, var(--border-primary));
+            font-family: inherit;
+            font-size: 0.85rem;
+            color: var(--text-secondary);
+            user-select: none;
+        }
+        .uv-logs-content {
+            background: var(--bg-secondary);
+            padding: 1rem;
+            border-left: 3px solid var(--border-color, var(--border-primary));
+            white-space: pre-wrap;
+            font-family: monospace;
+            font-size: 0.85rem;
+            color: var(--text-secondary);
+            overflow-x: auto;
+        }
+        /* Artifact list under a cell */
+        .cell-artifacts {
+            margin: 1rem 0;
+        }
+        .cell-artifacts h4 {
+            margin: 0 0 0.5rem 0;
+            font-size: 0.9rem;
+            color: var(--text-secondary);
+        }
+        /* Artifact chip: inline link-coloured pill with a hover tint */
+        .artifact {
+            display: inline-block;
+            margin: 0.25rem 0.5rem 0.25rem 0;
+            padding: 0.25rem 0.5rem;
+            border: 1px solid var(--border-primary);
+            border-radius: 1px;
+            background: var(--bg-artifact);
+            color: var(--text-link);
+            font-family: inherit;
+            font-size: 0.8rem;
+            text-decoration: none;
+            transition: background-color 0.2s ease;
+        }
+        .artifact:hover {
+            background: var(--bg-artifact-hover);
+        }
+        /* Inline preview of image/SVG artifacts */
+        .artifact-preview {
+            margin-top: 1rem;
+        }
+        /* Raster and vector previews scale down to the column and share a thin frame */
+        .artifact-preview img,
+        .artifact-preview svg {
+            max-width: 100%;
+            height: auto;
+            border: 1px solid var(--border-primary);
+            border-radius: 1px;
+        }
+        /* SVG only: block-level, with a transparent canvas so the page background shows through */
+        .artifact-preview svg {
+            display: block;
+            background: transparent;
+        }
+        /* Force the theme text colour as the fill of every SVG group */
+        .artifact-preview svg g {
+            fill: var(--text-primary) !important;
+        }
+        /* CSV artifacts rendered as a table; the wrapper scrolls horizontally */
+        .artifact-csv {
+            margin-top: 1rem;
+            overflow-x: auto;
+        }
+        .csv-table {
+            width: 100%;
+            border: 1px solid var(--border-primary);
+            border-radius: 1px;
+            border-collapse: collapse;
+            background: var(--bg-secondary);
+            font-size: 0.9rem;
+        }
+        .csv-table th,
+        .csv-table td {
+            padding: 0.5rem 0.75rem;
+            border: 1px solid var(--border-primary);
+            text-align: left;
+        }
+        .csv-table th {
+            background: var(--bg-tertiary);
+            color: var(--text-primary);
+            font-weight: 600;
+        }
+        .csv-table tbody tr:hover {
+            background: var(--bg-artifact-hover);
+        }
+        /* Shown in place of the table when a CSV could not be rendered */
+        .artifact-csv-error {
+            margin-top: 1rem;
+            padding: 1rem;
+            border: 1px solid var(--border-error);
+            border-radius: 1px;
+            background: var(--bg-error);
+            color: var(--text-error);
+        }
+        /* Failed cell: error-coloured border and header */
+        .cell-failed {
+            border-color: var(--border-cell-failed);
+        }
+        .cell-failed .cell-header {
+            color: var(--text-error);
+            background: var(--bg-error);
+        }
+        /* Commented-out cell: faded, dashed outline, italic header */
+        .cell-commented {
+            border-style: dashed;
+            opacity: 0.6;
+        }
+        .cell-commented .cell-header {
+            color: var(--text-secondary);
+            background: var(--bg-secondary);
+            font-style: italic;
+        }
+        /* Small action buttons (run / copy / raw / GitHub / HF): shared chrome */
+        .run-btn,
+        .copy-btn,
+        .raw-btn,
+        .github-btn,
+        .hf-btn {
+            margin-left: 4px;
+            padding: 2px 6px;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-tertiary);
+            color: var(--text-secondary);
+            font-family: inherit;
+            font-size: 0.75em;
+            cursor: pointer;
+        }
+        /* Link-style variants: box-like and never underlined, including on hover */
+        .raw-btn,
+        .github-btn,
+        .hf-btn {
+            display: inline-block;
+            text-decoration: none;
+        }
+        .run-btn:hover,
+        .copy-btn:hover,
+        .raw-btn:hover,
+        .github-btn:hover,
+        .hf-btn:hover {
+            color: var(--text-primary);
+            background: var(--bg-primary);
+        }
+        .raw-btn:hover,
+        .github-btn:hover,
+        .hf-btn:hover {
+            text-decoration: none;
+        }
+        /* Real buttons can be disabled */
+        .run-btn:disabled,
+        .copy-btn:disabled {
+            opacity: 0.6;
+            cursor: not-allowed;
+        }
+        /* Green confirmation state after a copy; must stay after the hover rules
+           because it has the same specificity and overrides their colours */
+        .copy-btn.copied {
+            color: #4caf50;
+            background: var(--bg-primary);
+            border-color: #4caf50;
+            transition: all 0.2s ease;
+        }
+        /* Stale output: dimmed, with an "updating" badge in its top-right corner */
+        .output-stale {
+            position: relative; /* positioning context for the badge */
+            opacity: 0.5;
+        }
+        .output-stale::after {
+            content: '⏳ updating...';
+            position: absolute;
+            top: 8px;
+            right: 8px;
+            padding: 4px 8px;
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            background: var(--bg-secondary);
+            color: var(--text-secondary);
+            font-size: 0.75em;
+        }
+        /* Headings share spacing and colour; h1 sits flush with the top of the page */
+        h1,
+        h2,
+        h3,
+        h4,
+        h5,
+        h6 {
+            margin-top: 1.5rem;
+            margin-bottom: 0.75rem;
+            color: var(--text-primary);
+        }
+        h1 {
+            margin-top: 0;
+            margin-bottom: 1rem;
+        }
+        p {
+            margin: 0.75rem 0;
+            color: var(--text-primary);
+        }
+        a {
+            color: var(--text-link);
+        }
+        /* Images never overflow the column */
+        img {
+            max-width: 100%;
+            height: auto;
+            border-radius: 1px;
+            box-shadow: none;
+        }
+        /* Code uses the monospace stack at the size set by --code-font-size */
+        pre,
+        code {
+            font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+            font-size: var(--code-font-size);
+        }
+        /* Positioning context for the line-highlight bar */
+        .code-wrap {
+            position: relative;
+        }
+        /* Highlight bar: hidden by default, spans the full width, ignores pointer events */
+        .code-line-highlight {
+            display: none;
+            position: absolute;
+            left: 0;
+            right: 0;
+            height: 1.5em;
+            border-left: 3px solid #f4c542;
+            background: rgba(255, 235, 170, 0.35);
+            pointer-events: none;
+        }
+        .line-number {
+            padding: 0 0.25rem;
+            color: var(--text-secondary);
+            text-decoration: none;
+            cursor: pointer;
+        }
+        .line-number.selected {
+            color: var(--text-primary);
+            background: rgba(255, 235, 170, 0.4);
+        }
+
+        /* Gutter + code laid out side by side; both read the --code-* metrics so rows line up */
+        .highlight-with-lines {
+            display: flex;
+        }
+        .line-numbers {
+            padding: var(--code-pad-y) 0.5rem;
+            border-right: 1px solid var(--border-primary);
+            background: var(--bg-tertiary);
+            color: var(--text-secondary);
+            font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+            font-size: var(--code-font-size);
+            line-height: var(--code-line-height);
+            text-align: right;
+            user-select: none;
+        }
+        .line-numbers .line-number {
+            display: block;
+            line-height: var(--code-line-height);
+        }
+        /* The code column takes the remaining width */
+        .highlight-with-lines .highlight {
+            flex: 1;
+        }
+        /* Neutralise the Pygments line highlight so it does not clash with ours */
+        .highlight .hll {
+            background-color: transparent;
+        }
+        .highlight pre {
+            margin: 0;
+            padding: var(--code-pad-y) 0.75rem;
+            line-height: var(--code-line-height);
+            white-space: pre;
+        }
+        
+        /* Code pane: visible by default with a divider underneath */
+        .cell-code {
+            display: block;
+            border-bottom: 1px solid var(--border-primary);
+        }
+        /* State classes: .expanded shows the pane, .collapsed hides it */
+        .cell-code.expanded {
+            display: block;
+        }
+        .cell-code.collapsed {
+            display: none;
+        }
+        
+        
+        /* Pygments base rules: default pre line height and line-number cells */
+        pre {
+            line-height: 125%;
+        }
+/* Ordinary line numbers inherit the surrounding colour */
+td.linenos .normal,
+span.linenos {
+    color: inherit;
+    background-color: transparent;
+    padding-left: 5px;
+    padding-right: 5px;
+}
+/* "Special" line numbers: black on pale yellow */
+td.linenos .special,
+span.linenos.special {
+    color: #000000;
+    background-color: #ffffc0;
+    padding-left: 5px;
+    padding-right: 5px;
+}
+[data-theme="light"] .highlight .hll { background-color: #ffffcc }
+[data-theme="light"] .highlight { background: #f8f8f8; }
+[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */
+[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */
+[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */
+[data-theme="light"] .highlight .o { color: #666 } /* Operator */
+[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */
+[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */
+[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */
+[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */
+[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */
+[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */
+[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */
+[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */
+[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
+[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */
+[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */
+[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */
+[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */
+[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */
+[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */
+[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */
+[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */
+[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */
+[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */
+[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */
+[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */
+[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */
+[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */
+[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */
+[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */
+[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */
+[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */
+[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */
+[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */
+[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */
+[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */
+[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */
+[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */
+[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */
+[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */
+[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */
+[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */
+[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */
+[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */
+[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */
+[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */
+[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */
+[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */
+[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */
+[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */
+[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */
+[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */
+[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */
+[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */
+[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */
+[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */
+[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */
+[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */
+[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */
+[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */
+[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */
+
+pre { line-height: 125%; }
+td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
+span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
+td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
+span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
+[data-theme="dark"] .highlight .hll { background-color: #49483e }
+[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 }
+[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */
+[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */
+[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */
+[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */
+[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */
+[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */
+[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */
+[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */
+[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */
+[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */
+[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */
+[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */
+[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */
+[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */
+[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */
+[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */
+[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */
+[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */
+[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */
+[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */
+[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */
+[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */
+[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */
+[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */
+[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */
+[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */
+[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */
+[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */
+[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */
+[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */
+[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */
+[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */
+[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */
+[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */
+[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */
+[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */
+[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */
+[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */
+[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */
+[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */
+[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */
+[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */
+[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */
+[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */
+[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */
+[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */
+[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */
+[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */
+[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */
+[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */
+[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */
+[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */
+[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */
+[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */
+[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */
+[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */
+[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */
+[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */
+[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */
+[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */
+[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */
+[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */
+[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */
+[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */
+[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */
+[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */
+[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */
+[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */
+[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */
+[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */
+[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */
+[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */
+[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */
+[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */
+[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */
+[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */
+[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */
+[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */
+
+        /* Ensure our code metrics override Pygments defaults */
+        .highlight pre {
+            white-space: pre;
+            margin: 0;
+            padding: var(--code-pad-y) 0.75rem !important;
+            line-height: var(--code-line-height) !important;
+            font-size: var(--code-font-size) !important;
+            font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important;
+            border: none;
+        }
+        .line-numbers { line-height: var(--code-line-height) !important; }
+        .line-numbers .line-number { line-height: var(--code-line-height) !important; }
+
+        /* Custom CSS from frontmatter */
+        
+
+        
+        
+        
+        /* Cursor for tools */
+        body[data-tool="arrow"] .main-content { 
+            cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><path d="M2 2l7.586 7.586"/><circle cx="11" cy="11" r="2"/></svg>') 12 12, crosshair;
+        }
+        body[data-tool="pen"] .main-content { 
+            cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><circle cx="4" cy="20" r="2" fill="%23e53935"/></svg>') 4 20, pointer;
+        }
+        body[data-tool="eraser"] .main-content { 
+            cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M20 20H7l-7-7 7-7h13v14z"/><path d="M13 13l7-7"/><path d="M13 13L9 9"/></svg>') 12 12, auto;
+        }
+
+        /* Color picker styles */
+        .tools-section-title {
+            font-weight: bold;
+            color: var(--text-secondary);
+            font-size: 0.65rem;
+            margin: 0.75rem 0 0.5rem 0;
+            text-transform: uppercase;
+            letter-spacing: 0.5px;
+        }
+        .color-row {
+            display: grid;
+            grid-template-columns: repeat(6, 1fr);
+            gap: 0.25rem;
+            margin-bottom: 0.5rem;
+        }
+        .color-swatch {
+            width: 18px;
+            height: 18px;
+            border: 2px solid var(--border-primary);
+            border-radius: 3px;
+            cursor: pointer;
+            transition: all 0.2s ease;
+            position: relative;
+        }
+        .color-swatch:hover {
+            transform: scale(1.1);
+            border-color: var(--text-secondary);
+        }
+        .color-swatch.selected {
+            border-color: var(--text-primary);
+            box-shadow: 0 0 0 2px var(--text-link);
+        }
+        .color-swatch.selected::after {
+            content: '✓';
+            position: absolute;
+            top: 50%;
+            left: 50%;
+            transform: translate(-50%, -50%);
+            color: white;
+            font-size: 10px;
+            font-weight: bold;
+            text-shadow: 1px 1px 1px black;
+        }
+        .color-input {
+            width: 24px;
+            height: 24px;
+            border: 2px solid var(--border-primary);
+            border-radius: 3px;
+            cursor: pointer;
+            background: none;
+            padding: 0;
+            grid-column: span 2;
+            justify-self: center;
+        }
+        .color-input:hover {
+            border-color: var(--text-secondary);
+        }
+        
+        /* Thickness slider styles */
+        .thickness-row {
+            display: flex;
+            align-items: center;
+            gap: 0.5rem;
+            margin-top: 0.75rem;
+        }
+        .thickness-slider {
+            flex: 1;
+            -webkit-appearance: none;
+            appearance: none;
+            height: 4px;
+            background: var(--border-primary);
+            border-radius: 2px;
+            outline: none;
+            opacity: 0.7;
+            transition: opacity 0.2s;
+        }
+        .thickness-slider:hover {
+            opacity: 1;
+        }
+        .thickness-slider::-webkit-slider-thumb {
+            -webkit-appearance: none;
+            appearance: none;
+            width: 12px;
+            height: 12px;
+            background: var(--text-link);
+            border-radius: 50%;
+            cursor: pointer;
+        }
+        .thickness-slider::-moz-range-thumb {
+            width: 12px;
+            height: 12px;
+            background: var(--text-link);
+            border-radius: 50%;
+            cursor: pointer;
+            border: none;
+        }
+        .thickness-value {
+            font-size: 0.7rem;
+            color: var(--text-secondary);
+            min-width: 20px;
+            text-align: right;
+        }
+
+        .highlight {
+            background: none !important;
+        }
+        
+        /* Loading animations */
+        .loading-spinner {
+            display: inline-block;
+            width: 16px;
+            height: 16px;
+            border: 2px solid var(--border-primary);
+            border-radius: 50%;
+            border-top-color: var(--text-link);
+            animation: spin 1s linear infinite;
+            margin-right: 8px;
+            vertical-align: middle;
+        }
+        
+        @keyframes spin {
+            to { transform: rotate(360deg); }
+        }
+        
+        .loading-skeleton {
+            display: inline-block;
+            background: var(--bg-tertiary);
+            background: linear-gradient(
+                90deg,
+                var(--bg-tertiary) 25%,
+                var(--bg-secondary) 50%,
+                var(--bg-tertiary) 75%
+            );
+            background-size: 200% 100%;
+            animation: loading-shimmer 2s ease-in-out infinite;
+            border-radius: 2px;
+            height: 1em;
+            width: 80px;
+            vertical-align: middle;
+        }
+        
+        @keyframes loading-shimmer {
+            0% { background-position: -200% 0; }
+            100% { background-position: 200% 0; }
+        }
+        
+        /* Loading state for cell output */
+        .cell-output:has(.loading-spinner) {
+            opacity: 0.7;
+            background: var(--bg-secondary);
+            /* border-left: 3px solid var(--text-link); */
+        }
+    </style>
+    <script>
+// --- Drag utilities ---
+        function clamp(val, min, max) { return Math.max(min, Math.min(max, val)); }
+
+        function restorePosition(el, storageKey) {
+            try {
+                const raw = localStorage.getItem(storageKey);
+                if (!raw) return;
+                const pos = JSON.parse(raw);
+                if (typeof pos.left === 'number' && typeof pos.top === 'number') {
+                    el.style.left = pos.left + 'px';
+                    el.style.top = pos.top + 'px';
+                    el.style.right = 'auto';
+                    el.style.bottom = 'auto';
+                }
+            } catch (_) {}
+        }
+
+        function savePosition(el, storageKey) {
+            try {
+                const left = parseFloat(el.style.left || 'NaN');
+                const top = parseFloat(el.style.top || 'NaN');
+                if (!Number.isNaN(left) && !Number.isNaN(top)) {
+                    localStorage.setItem(storageKey, JSON.stringify({ left, top }));
+                }
+            } catch (_) {}
+        }
+
+
+        function makeDraggable(el, storageKey, handleEl) {
+            let dragging = false;
+            let startX = 0, startY = 0; // cursor
+            let origLeft = 0, origTop = 0; // element
+
+            const onMove = (e) => {
+                if (!dragging) return;
+                const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+                const clientY = e.touches ? e.touches[0].clientY : e.clientY;
+                const dx = clientX - startX;
+                const dy = clientY - startY;
+                const w = el.offsetWidth;
+                const h = el.offsetHeight;
+                const maxX = window.innerWidth - w;
+                const maxY = window.innerHeight - h;
+                const newLeft = clamp(origLeft + dx, 0, maxX);
+                const newTop = clamp(origTop + dy, 0, maxY);
+                el.style.left = newLeft + 'px';
+                el.style.top = newTop + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+            };
+
+            const endDrag = () => {
+                if (!dragging) return;
+                dragging = false;
+                document.removeEventListener('mousemove', onMove);
+                document.removeEventListener('mouseup', endDrag);
+                document.removeEventListener('touchmove', onMove);
+                document.removeEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grab');
+                savePosition(el, storageKey);
+                // ensure no-overlap constraint after a drag
+                try { layoutWidgetsStackedBottomRight(); } catch (_) {}
+            };
+
+            const startDrag = (e) => {
+                // Start from element's current on-screen rect
+                const elRect = el.getBoundingClientRect();
+                el.style.left = elRect.left + 'px';
+                el.style.top = elRect.top + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+
+                dragging = true;
+                startX = e.touches ? e.touches[0].clientX : e.clientX;
+                startY = e.touches ? e.touches[0].clientY : e.clientY;
+                origLeft = elRect.left;
+                origTop = elRect.top;
+
+                document.addEventListener('mousemove', onMove);
+                document.addEventListener('mouseup', endDrag);
+                document.addEventListener('touchmove', onMove, { passive: false });
+                document.addEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grabbing');
+                e.preventDefault();
+            };
+
+            (handleEl || el).addEventListener('mousedown', startDrag);
+            (handleEl || el).addEventListener('touchstart', startDrag, { passive: false });
+
+            // Apply any saved position on init
+            restorePosition(el, storageKey);
+        }
+        function toggleCell(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+            }
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+            }
+            
+            updateIndicators(cellId);
+            encodeToolStateToUrl();
+        }
+        
+        function toggleCode(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+        
+        function toggleOutput(cellId) {
+            const outputElement = document.getElementById('output-' + cellId);
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+
+        function toggleUvLogs(headerElement) {
+            const contentElement = headerElement.nextElementSibling;
+            if (contentElement) {
+                const isCollapsed = contentElement.style.display === 'none';
+                contentElement.style.display = isCollapsed ? 'block' : 'none';
+                headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+
+                // Update the header indicator if it exists
+                const uvLogsDiv = headerElement.parentElement;
+                if (uvLogsDiv && uvLogsDiv.id && uvLogsDiv.id.startsWith('uv-logs-')) {
+                    const cellId = uvLogsDiv.id.replace('uv-logs-', '');
+                    const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+
+        function toggleUvLogsFromHeader(cellId) {
+            const uvLogsElement = document.getElementById('uv-logs-' + cellId);
+            const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+            if (uvLogsElement) {
+                const headerElement = uvLogsElement.querySelector('.uv-logs-header');
+                const contentElement = uvLogsElement.querySelector('.uv-logs-content');
+                if (contentElement && headerElement) {
+                    const isCollapsed = contentElement.style.display === 'none';
+                    contentElement.style.display = isCollapsed ? 'block' : 'none';
+                    headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+        
+        function updateIndicators(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            const indicators = document.querySelector(`[onclick*="${cellId}"]`)?.closest('.cell-header')?.querySelector('.collapse-indicators');
+            
+            if (indicators) {
+                const codeCollapsed = codeElement && codeElement.classList.contains('collapsed');
+                const outputCollapsed = outputElement && outputElement.classList.contains('collapsed');
+                
+                const codeIcon = codeCollapsed ? '▶' : '▼';
+                const outputIcon = outputCollapsed ? '▶' : '▼';
+                
+                const codeSpan = indicators.querySelector('[onclick*="toggleCode"]');
+                const outputSpan = indicators.querySelector('[onclick*="toggleOutput"]');
+                
+                if (codeSpan) codeSpan.innerHTML = `${codeIcon} code`;
+                if (outputSpan) outputSpan.innerHTML = `${outputIcon} output`;
+            }
+        }
+        
+        function toggleTheme() {
+            const html = document.documentElement;
+            const currentTheme = html.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            html.setAttribute('data-theme', newTheme);
+            localStorage.setItem('uvnote-theme', newTheme);
+            updateThemeIcon();
+            updateUiDebug();
+        }
+        
+        // Two panel code removed
+        
+        function updateThemeIcon() {
+            const theme = document.documentElement.getAttribute('data-theme');
+            const toggle = document.querySelector('.theme-toggle');
+            if (toggle) {
+                toggle.textContent = theme === 'dark' ? 'light' : 'dark';
+            }
+        }
+        function setUiTheme(newUi) {
+            if (newUi !== 'default' && newUi !== 'none' && newUi !== 'monocolor') return;
+            const html = document.documentElement;
+            html.setAttribute('data-ui', newUi);
+            try { localStorage.setItem('uvnote-ui', newUi); } catch (_) {}
+            updateUiMenu();
+            updateUiDebug();
+        }
+        function updateUiMenu() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const checks = {
+                default: document.getElementById('checkbox-ui-default'),
+                none: document.getElementById('checkbox-ui-none'),
+                monocolor: document.getElementById('checkbox-ui-monocolor')
+            };
+            if (checks.default) checks.default.textContent = ui === 'default' ? '☑' : '☐';
+            if (checks.none) checks.none.textContent = ui === 'none' ? '☑' : '☐';
+            if (checks.monocolor) checks.monocolor.textContent = ui === 'monocolor' ? '☑' : '☐';
+        }
+
+        function updateUiDebug() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const color = document.documentElement.getAttribute('data-theme') || 'light';
+            const el = document.getElementById('ui-debug');
+            if (el) {
+                el.textContent = `UI: ${ui} | Color: ${color}`;
+            }
+        }
+
+        // Line selection and deep-linking
+        function clearLineSelections() {
+            try {
+                document.querySelectorAll('.code-line-highlight').forEach(el => { el.style.display = 'none'; });
+                document.querySelectorAll('.line-number.selected').forEach(el => el.classList.remove('selected'));
+            } catch (_) {}
+        }
+
+        let _selection = null; // { cellId, a, b }
+
+        function clearSelection(updateUrl) {
+            clearLineSelections();
+            _selection = null;
+            if (updateUrl) {
+                try {
+                    const url = new URL(window.location.href);
+                    url.searchParams.delete('cell');
+                    url.searchParams.delete('line');
+                    history.replaceState(null, '', url.toString());
+                } catch (_) {}
+            }
+            updateStateIndicator();
+        }
+
+        function selectCellLine(cellId, line, updateUrl) {
+            try {
+                // Ensure only one selection across the whole document
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                // Measure line height directly from computed style
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                // Position overlay
+                overlay.style.display = 'block';
+                overlay.style.height = `${lh}px`;
+                overlay.style.top = `${pre.offsetTop + padTop + (line - 1) * lh}px`;
+
+                // Update selected class in line numbers
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    const sel = numbers.querySelector(`.line-number[data-line="${line}"]`);
+                    if (sel) sel.classList.add('selected');
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    url.searchParams.set('line', String(line));
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a: line, b: line };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLine error', e); }
+        }
+
+        function selectCellLines(cellId, startLine, endLine, updateUrl) {
+            try {
+                // normalize order
+                const a = Math.min(startLine, endLine);
+                const b = Math.max(startLine, endLine);
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                overlay.style.display = 'block';
+                overlay.style.top = `${pre.offsetTop + padTop + (a - 1) * lh}px`;
+                overlay.style.height = `${(b - a + 1) * lh}px`;
+
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    for (let i = a; i <= b; i++) {
+                        const el = numbers.querySelector(`.line-number[data-line="${i}"]`);
+                        if (el) el.classList.add('selected');
+                    }
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    if (a === b) url.searchParams.set('line', String(a));
+                    else url.searchParams.set('line', `${a}-${b}`);
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a, b };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLines error', e); }
+        }
+
+        // Drag-to-select support on line numbers.
+        // `active` is true while a drag is in progress; `cellId` and `start` hold the
+        // cell and the line number where the drag began (set in onLineNumberMouseDown,
+        // reset in onDocMouseUp).
+        let _lineDrag = { active: false, cellId: null, start: 0 };
+        function onLineNumberMouseDown(e) {
+            const a = e.target.closest('.line-number');
+            if (!a) return;
+            e.preventDefault();
+            const cellId = a.dataset.cell;
+            const line = parseInt(a.dataset.line || '1', 10) || 1;
+            // Toggle off if this exact single line is already the only selection
+            const numbers = document.getElementById(`lines-${cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length === 1 && selected[0] === line) {
+                    clearSelection(true);
+                    return;
+                }
+            }
+            _lineDrag.active = true;
+            _lineDrag.cellId = cellId;
+            _lineDrag.start = line;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, _lineDrag.start, false);
+        }
+        function onDocMouseMove(e) {
+            if (!_lineDrag.active) return;
+            const el = document.elementFromPoint(e.clientX, e.clientY);
+            if (!el) return;
+            const a = el.closest && el.closest('.line-number');
+            if (!a) return;
+            if (a.dataset.cell !== _lineDrag.cellId) return;
+            const cur = parseInt(a.dataset.line || '1', 10) || 1;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, cur, false);
+        }
+        function onDocMouseUp(e) {
+            if (!_lineDrag.active) return;
+            const last = document.querySelector('.line-number.selected:last-of-type');
+            // finalize URL using the current selected range
+            const numbers = document.getElementById(`lines-${_lineDrag.cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length) {
+                    const a = Math.min(...selected); const b = Math.max(...selected);
+                    selectCellLines(_lineDrag.cellId, a, b, true);
+                }
+            }
+            _lineDrag.active = false; _lineDrag.cellId = null; _lineDrag.start = 0;
+        }
+
+        function applyLocationFromUrl() {
+            try {
+                const url = new URL(window.location.href);
+                const cell = url.searchParams.get('cell');
+                const lineParam = url.searchParams.get('line');
+                if (cell && lineParam) {
+                    if (lineParam.includes('-')) {
+                        const [a, b] = lineParam.split('-').map(x => parseInt(x, 10));
+                        if (!Number.isNaN(a) && !Number.isNaN(b)) selectCellLines(cell, a, b, false);
+                    } else {
+                        const l = parseInt(lineParam, 10);
+                        if (!Number.isNaN(l)) selectCellLine(cell, l, false);
+                    }
+                }
+                
+                // Apply tool parameters from URL
+                applyToolsFromUrl(url.searchParams);
+                
+                // Cell states will be applied later in DOMContentLoaded with proper timing
+                const encodedCellStates = url.searchParams.get('cells');
+                console.log('Encoded cell states from URL:', encodedCellStates);
+            } catch (_) {}
+        }
+        
+        function applyToolsFromUrl(params) {
+            try {
+                // Check if tools widget should be shown
+                const showTools = params.get('tools');
+                if (showTools === '1') {
+                    // Mark that tool was loaded from URL
+                    _urlLoadedTool = true;
+                    
+                    // Apply color
+                    const color = params.get('color');
+                    if (color && /^[0-9a-fA-F]{6}$/.test(color)) {
+                        setStoredArrowColor('#' + color);
+                    }
+                    
+                    // Apply thickness
+                    const thickness = params.get('thickness');
+                    if (thickness) {
+                        const value = parseInt(thickness, 10);
+                        if (value >= 1 && value <= 10) {
+                            setStoredLineThickness(value);
+                        }
+                    }
+                    
+                    // Don't override fadeout time for URL-loaded tools - let individual shapes decide
+                    
+                    // Load shapes from URL
+                    const encodedShapes = params.get('shapes');
+                    if (encodedShapes) {
+                        const decodedShapes = decodeShapesFromUrl(encodedShapes);
+                        if (decodedShapes.length > 0) {
+                            _shapes = decodedShapes;
+                            saveShapes();
+                            // Trigger render after overlay is initialized
+                            setTimeout(() => {
+                                renderOverlay();
+                            }, 300);
+                        }
+                    }
+                    
+                    // Wait for widgets to be initialized before showing tools
+                    setTimeout(() => {
+                        const toolsWidget = document.querySelector('.tools-widget');
+                        const checkbox = document.getElementById('checkbox-tools');
+                        if (toolsWidget && checkbox) {
+                            toolsWidget.style.display = 'block';
+                            checkbox.textContent = '☑';
+                            localStorage.setItem('uvnote-widget-tools', 'visible');
+                        }
+                        
+                        // Apply active tool
+                        const activeTool = params.get('tool');
+                        if (activeTool && ['arrow', 'pen', 'eraser', 'spotlight'].includes(activeTool)) {
+                            const toolBtn = Array.from(document.querySelectorAll('.tool-button')).find(btn => btn.textContent === activeTool);
+                            if (toolBtn) {
+                                toolBtn.click();
+                            }
+                        }
+                        
+                        // Re-layout widgets after showing tools
+                        layoutWidgetsStackedBottomRight();
+                    }, 200);
+                }
+            } catch (_) {}
+        }
+        
+        function captureInitialCellStates() {
+            const cells = document.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    if (codeEl) {
+                        state.c = codeEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    if (outputEl) {
+                        state.o = outputEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    _initialCellStates[cellId] = state;
+                }
+            });
+            console.log('Captured initial cell states:', _initialCellStates);
+        }
+
+        function encodeCellStatesToUrl() {
+            // Get all cells and their collapse states
+            const cells = document.querySelectorAll('.cell');
+            const cellStates = {};
+            
+            console.log('Found cells:', cells.length);
+            
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                const initialState = _initialCellStates[cellId] || {};
+                
+                console.log(`Encoding cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    codeCollapsed: codeEl ? codeEl.classList.contains('collapsed') : 'N/A',
+                    outputCollapsed: outputEl ? outputEl.classList.contains('collapsed') : 'N/A',
+                    initialState: initialState
+                });
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    let hasChanges = false;
+                    
+                    if (codeEl) {
+                        const currentCodeState = codeEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialCodeState = initialState.c;
+                        // Only encode if different from initial state
+                        if (initialCodeState !== undefined && currentCodeState !== initialCodeState) {
+                            state.c = currentCodeState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    if (outputEl) {
+                        const currentOutputState = outputEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialOutputState = initialState.o;
+                        // Only encode if different from initial state
+                        if (initialOutputState !== undefined && currentOutputState !== initialOutputState) {
+                            state.o = currentOutputState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    // Only include cell if it has changes from initial state
+                    if (hasChanges) {
+                        cellStates[cellId] = state;
+                        console.log(`Added cell ${cellId}:`, state);
+                    }
+                }
+            });
+            
+            console.log('Final cell states to encode:', cellStates);
+            
+            // Return empty string if no changed cells
+            if (Object.keys(cellStates).length === 0) return '';
+            
+            // Encode as compact base64 string
+            const encoded = btoa(JSON.stringify(cellStates));
+            console.log('Encoded cell states:', encoded);
+            return encoded;
+        }
+        
+        function decodeCellStatesFromUrl(encodedStates) {
+            if (!encodedStates) return {};
+            
+            try {
+                return JSON.parse(atob(encodedStates));
+            } catch (e) {
+                console.error('Failed to decode cell states:', e);
+                return {};
+            }
+        }
+        
+        function applyCellStatesFromUrl(cellStates) {
+            console.log('Applying cell states from URL:', cellStates);
+            Object.entries(cellStates).forEach(([cellId, state]) => {
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                console.log(`Cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    state: state
+                });
+                
+                if (codeEl && state.c !== undefined) {
+                    if (state.c === 0) {
+                        codeEl.classList.add('collapsed');
+                        console.log(`Collapsed code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    } else {
+                        codeEl.classList.remove('collapsed');
+                        codeEl.classList.add('expanded'); // Explicitly add expanded class
+                        console.log(`Expanded code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            hasExpandedClass: codeEl.classList.contains('expanded'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    }
+                }
+                
+                if (outputEl && state.o !== undefined) {
+                    if (state.o === 0) {
+                        outputEl.classList.add('collapsed');
+                        console.log(`Collapsed output for cell ${cellId}`);
+                    } else {
+                        outputEl.classList.remove('collapsed');
+                        console.log(`Expanded output for cell ${cellId}`);
+                    }
+                }
+                
+                // Update visual indicators and force style recalculation
+                try {
+                    updateIndicators(cellId);
+                    // Force browser to recalculate styles
+                    if (codeEl) {
+                        codeEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - code visible: ${getComputedStyle(codeEl).display !== 'none'}`);
+                    }
+                    if (outputEl) {
+                        outputEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - output visible: ${getComputedStyle(outputEl).display !== 'none'}`);
+                    }
+                } catch (e) {
+                    console.error(`Error updating indicators for cell ${cellId}:`, e);
+                }
+            });
+        }
+
+        function encodeShapesToUrl() {
+            // Encode shapes as compact base64 string
+            if (_shapes.length === 0) return '';
+            
+            const shapeData = _shapes.map(shape => {
+                const baseData = {
+                    ct: shape.createdAt, // creation timestamp
+                    fo: shape.fadeoutTime || getFadeoutTime() // fadeout time for this shape
+                };
+                
+                if (shape.type === 'arrow') {
+                    return {
+                        ...baseData,
+                        t: 'a',
+                        x1: Math.round(shape.x1),
+                        y1: Math.round(shape.y1),
+                        x2: Math.round(shape.x2),
+                        y2: Math.round(shape.y2),
+                        c: shape.color.substring(1), // remove #
+                        w: shape.width
+                    };
+                } else if (shape.type === 'pen') {
+                    return {
+                        ...baseData,
+                        t: 'p',
+                        pts: shape.points.map(p => [Math.round(p.x), Math.round(p.y)]),
+                        c: shape.color.substring(1),
+                        w: shape.width
+                    };
+                } else if (shape.type === 'spotlight') {
+                    return {
+                        ...baseData,
+                        t: 's',
+                        x: Math.round(shape.x),
+                        y: Math.round(shape.y),
+                        r: Math.round(shape.radius)
+                    };
+                }
+            }).filter(Boolean);
+            
+            return btoa(JSON.stringify(shapeData));
+        }
+        
+        function decodeShapesFromUrl(encodedShapes) {
+            if (!encodedShapes) return [];
+            
+            try {
+                const shapeData = JSON.parse(atob(encodedShapes));
+                return shapeData.map(data => {
+                    const base = {
+                        createdAt: data.ct || Date.now(), // use encoded timestamp or current time
+                        fadeoutTime: data.fo || 0, // use encoded fadeout time or 0 (never fade)
+                        opacity: 1.0
+                    };
+                    
+                    if (data.t === 'a') {
+                        return {
+                            ...base,
+                            type: 'arrow',
+                            x1: data.x1,
+                            y1: data.y1,
+                            x2: data.x2,
+                            y2: data.y2,
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 'p') {
+                        return {
+                            ...base,
+                            type: 'pen',
+                            points: data.pts.map(([x, y]) => ({ x, y })),
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 's') {
+                        return {
+                            ...base,
+                            type: 'spotlight',
+                            x: data.x,
+                            y: data.y,
+                            radius: data.r,
+                            color: '#000000'
+                        };
+                    }
+                }).filter(Boolean);
+            } catch (e) {
+                console.error('Failed to decode shapes:', e);
+                return [];
+            }
+        }
+        
+        function encodeToolStateToUrl() {
+            // Don't update URL during initialization
+            if (_isInitializing) {
+                return window.location.href;
+            }
+            
+            const params = new URLSearchParams(window.location.search);
+            
+            // Check if tools widget is visible and has an active tool
+            const toolsWidget = document.querySelector('.tools-widget');
+            const activeTool = document.body.dataset.tool;
+            const hasActiveTool = activeTool && activeTool !== 'none';
+            const toolsWidgetVisible = toolsWidget && getComputedStyle(toolsWidget).display !== 'none';
+            
+            // Always handle shapes regardless of tool state
+            const encodedShapes = encodeShapesToUrl();
+            if (encodedShapes) {
+                params.set('shapes', encodedShapes);
+            } else {
+                params.delete('shapes');
+            }
+            
+            // Always preserve existing cell states from URL if present
+            const existingCellStates = params.get('cells');
+            if (existingCellStates) {
+                // Keep existing cell states - don't re-encode from DOM
+                params.set('cells', existingCellStates);
+            } else {
+                // Only encode new cell states if none exist in URL
+                const encodedCellStates = encodeCellStatesToUrl();
+                if (encodedCellStates) {
+                    params.set('cells', encodedCellStates);
+                }
+            }
+            
+            if (toolsWidgetVisible && hasActiveTool) {
+                // Include tool params when widget is visible AND tool is active
+                params.set('tools', '1');
+                params.set('tool', activeTool);
+                
+                // Get color (without # prefix)
+                const color = getArrowColor();
+                if (color && color.startsWith('#')) {
+                    params.set('color', color.substring(1));
+                }
+                
+                // Get thickness
+                const thickness = getLineThickness();
+                params.set('thickness', thickness.toString());
+            } else {
+                // Remove tool state params but keep shapes
+                params.delete('tools');
+                params.delete('tool');
+                params.delete('color');
+                params.delete('thickness');
+                params.delete('fadeout');
+            }
+            
+            // Update URL without reloading
+            const newUrl = window.location.pathname + (params.toString() ? '?' + params.toString() : '') + window.location.hash;
+            window.history.replaceState(null, '', newUrl);
+            
+            return window.location.href;
+        }
+
+        function resetLayout() {
+            try {
+                // Clear all uvnote-* keys
+                const allKeys = Object.keys(localStorage);
+                const uvnoteKeys = allKeys.filter(key => key.startsWith('uvnote-'));
+                uvnoteKeys.forEach(k => localStorage.removeItem(k));
+            } catch (_) {}
+            
+            // Clear any active selection and remove URL params
+            try { clearSelection(true); } catch(_) {}
+            // Reset active tool if any
+            try { window.setActiveTool('none'); } catch(_) {}
+            // Clear shapes
+            try { _shapes = []; saveShapes(); } catch(_) {}
+            // Reset URL-loaded tool flag
+            try { _urlLoadedTool = false; } catch(_) {}
+            // Reset all cells to expanded state
+            try {
+                const cells = document.querySelectorAll('.cell');
+                cells.forEach(cell => {
+                    const cellId = cell.id.replace('cell-', '');
+                    const codeEl = document.getElementById('code-' + cellId);
+                    const outputEl = document.getElementById('output-' + cellId);
+                    if (codeEl) codeEl.classList.remove('collapsed');
+                    if (outputEl) outputEl.classList.remove('collapsed');
+                    updateIndicators(cellId);
+                });
+            } catch(_) {}
+            
+            // Clear ALL URL parameters and reload with clean URL
+            try {
+                const cleanUrl = window.location.pathname + window.location.hash;
+                window.location.href = cleanUrl; // Use window.location.href instead of history.replaceState + reload
+            } catch (_) {
+                // Fallback - reload current page
+                location.reload();
+            }
+        }
+
+        function toggleMenu() {
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.toggle('active');
+            }
+        }
+
+        function toggleWidget(widgetName) {
+            let widget;
+            let checkbox;
+            
+            // Close the menu first
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.remove('active');
+            }
+            
+            switch(widgetName) {
+                case 'tools':
+                    widget = document.querySelector('.tools-widget');
+                    checkbox = document.getElementById('checkbox-tools');
+                    break;
+                case 'file-explorer':
+                    widget = document.querySelector('.file-explorer');
+                    checkbox = document.getElementById('checkbox-file-explorer');
+                    break;
+                case 'minimap':
+                    widget = document.querySelector('.minimap');
+                    checkbox = document.getElementById('checkbox-minimap');
+                    break;
+                case 'status':
+                    widget = document.querySelector('.status-widget');
+                    checkbox = document.getElementById('checkbox-status');
+                    break;
+                default:
+                    return;
+            }
+            
+            if (widget && checkbox) {
+                const isVisible = getComputedStyle(widget).display !== 'none';
+                widget.style.display = isVisible ? 'none' : 'block';
+                checkbox.textContent = isVisible ? '☐' : '☑';
+                
+                // Save state to localStorage
+                try {
+                    localStorage.setItem(`uvnote-widget-${widgetName}`, isVisible ? 'hidden' : 'visible');
+                } catch (_) {}
+                
+                // Re-layout widgets after visibility change
+                try { 
+                    layoutWidgetsStackedBottomRight(); 
+                } catch (_) {}
+                
+                // Update URL when tools widget visibility changes
+                if (widgetName === 'tools') {
+                    encodeToolStateToUrl();
+                }
+            }
+        }
+        
+        function initializeWidgetVisibility() {
+            const widgets = [
+                { name: 'tools', selector: '.tools-widget' },
+                { name: 'file-explorer', selector: '.file-explorer' },
+                { name: 'minimap', selector: '.minimap' },
+                { name: 'status', selector: '.status-widget' }
+            ];
+            
+            widgets.forEach(({ name, selector }) => {
+                const defaultState = name === 'status' ? 'visible' : 'hidden';
+                const savedState = localStorage.getItem(`uvnote-widget-${name}`) || defaultState;
+                const widget = document.querySelector(selector);
+                const checkbox = document.getElementById(`checkbox-${name}`);
+                
+                if (widget && checkbox) {
+                    const isVisible = savedState === 'visible';
+                    widget.style.display = isVisible ? 'block' : 'none';
+                    checkbox.textContent = isVisible ? '☑' : '☐';
+                }
+            });
+        }
+
+        // Close menu when clicking outside
+        document.addEventListener('click', function(event) {
+            const menuButton = document.querySelector('.menu-button');
+            // Don't close if clicking on a menu item (let the item handler close it)
+            if (menuButton && !menuButton.contains(event.target)) {
+                menuButton.classList.remove('active');
+            }
+        });
+
+        // Layout: stack widgets bottom-right and equalize widths
+        function hasCustomWidgetPositions() {
+            try {
+                return (
+                    localStorage.getItem('uvnote-minimap-pos') ||
+                    localStorage.getItem('uvnote-file-explorer-pos') ||
+                    localStorage.getItem('uvnote-tools-pos')
+                );
+            } catch (_) { return false; }
+        }
+
+        function rectsOverlap(r1, r2) {
+            return !(r1.right <= r2.left || r2.right <= r1.left || r1.bottom <= r2.top || r2.bottom <= r1.top);
+        }
+
+        function widgetsOverlap(widgets) {
+            for (let i = 0; i < widgets.length; i++) {
+                const a = widgets[i];
+                const ra = a.getBoundingClientRect();
+                for (let j = i + 1; j < widgets.length; j++) {
+                    const b = widgets[j];
+                    const rb = b.getBoundingClientRect();
+                    if (rectsOverlap(ra, rb)) return true;
+                }
+            }
+            return false;
+        }
+
+        function applyStackLayout(widgets, order) {
+            if (!widgets.length) return;
+            // Fixed equal width
+            const fixedWidth = 220;
+            widgets.forEach(el => { el.style.width = fixedWidth + 'px'; });
+
+            // Fit heights if needed to avoid overflow
+            const gap = 12;
+            const available = Math.max(0, window.innerHeight - 40 - gap * (order.length - 1));
+            const eachMax = Math.floor(available / order.length);
+            order.forEach(el => {
+                el.style.maxHeight = eachMax + 'px';
+                el.style.overflowY = 'auto';
+            });
+
+            // Stack bottom-up in the requested order
+            let bottomOffset = 20; // base gutter
+            order.forEach(el => {
+                el.style.left = 'auto';
+                el.style.top = 'auto';
+                el.style.right = '20px';
+                el.style.bottom = bottomOffset + 'px';
+                bottomOffset += el.offsetHeight + gap;
+            });
+        }
+
+        function layoutWidgetsStackedBottomRight() {
+            const minimap = document.querySelector('.minimap');
+            const fileExplorer = document.querySelector('.file-explorer');
+            const tools = document.querySelector('.tools-widget');
+            const status = document.querySelector('.status-widget');
+            const widgets = [minimap, fileExplorer, tools, status].filter(el => el && getComputedStyle(el).display !== 'none');
+            if (!widgets.length) return;
+
+            const order = [minimap, fileExplorer, tools, status].filter(Boolean).filter(el => getComputedStyle(el).display !== 'none');
+
+            // If user placed custom positions and there is no overlap, respect them.
+            if (hasCustomWidgetPositions() && !widgetsOverlap(widgets)) return;
+
+            applyStackLayout(widgets, order);
+        }
+        
+        // Panel icon removed
+        
+        // Scroll listener bookkeeping for the minimap: the object the handler was
+        // attached to and the handler itself, kept so teardownMinimap can detach it.
+        // Both are null while no minimap listener is installed.
+        let _minimapScrollContainer = null;
+        let _minimapScrollHandler = null;
+        function initMinimap() {
+            // Generate minimap content
+            const minimap = createMinimap();
+            document.body.appendChild(minimap);
+            // Make draggable (use title as handle)
+            const mTitle = minimap.querySelector('.minimap-title');
+            makeDraggable(minimap, 'uvnote-minimap-pos', mTitle);
+
+            // Attach scroll listener to window (two-panel removed)
+            _minimapScrollContainer = window;
+
+            if (_minimapScrollContainer) {
+                _minimapScrollHandler = () => updateMinimapActive();
+                if (_minimapScrollContainer === window) {
+                    window.addEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.addEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            updateMinimapActive();
+        }
+
+        function teardownMinimap() {
+            const minimap = document.querySelector('.minimap');
+            if (minimap && minimap.parentNode) minimap.parentNode.removeChild(minimap);
+            if (_minimapScrollContainer && _minimapScrollHandler) {
+                if (_minimapScrollContainer === window) {
+                    window.removeEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.removeEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            _minimapScrollContainer = null;
+            _minimapScrollHandler = null;
+        }
+        
+        function initFileExplorer() {
+            // Generate file explorer content
+            const fileExplorer = createFileExplorer();
+            document.body.appendChild(fileExplorer);
+        }
+        
+        function createMinimap() {
+            const minimap = document.createElement('div');
+            minimap.className = 'minimap';
+            
+            const title = document.createElement('div');
+            title.className = 'minimap-title';
+            title.textContent = 'navigation';
+            minimap.appendChild(title);
+            
+            // Find all headings and cells
+            const root = document.querySelector('.main-content') || document;
+            const headings = root.querySelectorAll('h1, h2, h3, h4, h5, h6');
+            const cells = root.querySelectorAll('.cell');
+            
+            // Combine and sort by position
+            const items = [];
+            
+            headings.forEach(heading => {
+                const id = heading.id || generateId(heading.textContent);
+                if (!heading.id) heading.id = id;
+                
+                items.push({
+                    element: heading,
+                    type: 'heading',
+                    level: parseInt(heading.tagName.charAt(1)),
+                    text: heading.textContent.trim(),
+                    id: id,
+                    position: heading.getBoundingClientRect().top + window.scrollY
+                });
+            });
+            
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const id = cell.id || `cell-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+                    if (!cell.id) cell.id = id;
+                    
+                    items.push({
+                        element: cell,
+                        type: 'cell',
+                        text: header.textContent.trim(),
+                        id: id,
+                        position: cell.getBoundingClientRect().top + window.scrollY
+                    });
+                }
+            });
+            
+            // Sort by position
+            items.sort((a, b) => a.position - b.position);
+            
+            // Create minimap items
+            items.forEach(item => {
+                const link = document.createElement('a');
+                link.className = `minimap-item ${item.type === 'heading' ? 'minimap-heading' : 'minimap-cell'}`;
+                if (item.type === 'heading') {
+                    link.classList.add(`h${item.level}`);
+                }
+                link.textContent = item.text.length > 25 ? item.text.substring(0, 22) + '...' : item.text;
+                link.href = `#${item.id}`;
+                link.onclick = function(e) {
+                    e.preventDefault();
+                    item.element.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                };
+                minimap.appendChild(link);
+            });
+            
+            return minimap;
+        }
+        
+        function generateId(text) {
+            return text.toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '')
+                .substring(0, 20);
+        }
+        
+        function updateMinimapActive() {
+            const minimapItems = document.querySelectorAll('.minimap-item');
+            const container = _minimapScrollContainer || window;
+            const containerRect = container === window ? null : container.getBoundingClientRect();
+            const scrollPos = (container === window ? window.scrollY : container.scrollTop) + 100; // Offset for better detection
+            
+            let activeItem = null;
+            minimapItems.forEach(item => {
+                const targetId = item.getAttribute('href').substring(1);
+                const target = document.getElementById(targetId);
+                
+                if (target) {
+                    const rectTop = target.getBoundingClientRect().top;
+                    const targetPos = (container === window)
+                        ? rectTop + window.scrollY
+                        : rectTop - containerRect.top + container.scrollTop;
+                    if (targetPos <= scrollPos) {
+                        activeItem = item;
+                    }
+                }
+                
+                item.classList.remove('active');
+            });
+            
+            if (activeItem) {
+                activeItem.classList.add('active');
+            }
+        }
+        
+        function createFileExplorer() {
+            const fileExplorer = document.createElement('div');
+            fileExplorer.className = 'file-explorer';
+            
+            const title = document.createElement('div');
+            title.className = 'file-explorer-title';
+            title.textContent = 'files';
+            fileExplorer.appendChild(title);
+            // Make draggable (use title as handle)
+            makeDraggable(fileExplorer, 'uvnote-file-explorer-pos', title);
+            
+            // Scripts section
+            const scriptsSection = document.createElement('div');
+            scriptsSection.className = 'file-explorer-section';
+            
+            const scriptsTitle = document.createElement('div');
+            scriptsTitle.className = 'file-explorer-section-title';
+            scriptsTitle.textContent = 'scripts';
+            scriptsSection.appendChild(scriptsTitle);
+            
+            // Find all cells and list their script files (single panel)
+            const root = document.querySelector('.main-content') || document;
+            const cells = root.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const cellText = header.textContent.trim();
+                    const cellMatch = cellText.match(/Cell: ([a-zA-Z_][a-zA-Z0-9_]*)/);
+                    if (cellMatch) {
+                        const cellId = cellMatch[1];
+                        const scriptItem = document.createElement('div');
+                        scriptItem.className = 'file-explorer-item script';
+                        scriptItem.textContent = `${cellId}.py`;
+                        scriptItem.onclick = function() {
+                            cell.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        };
+                        scriptsSection.appendChild(scriptItem);
+                    }
+                }
+            });
+            
+            fileExplorer.appendChild(scriptsSection);
+            
+            // Artifacts section
+            const artifactsSection = document.createElement('div');
+            artifactsSection.className = 'file-explorer-section';
+            
+            const artifactsTitle = document.createElement('div');
+            artifactsTitle.className = 'file-explorer-section-title';
+            artifactsTitle.textContent = 'artifacts';
+            artifactsSection.appendChild(artifactsTitle);
+            
+            // Find all artifact links (single panel)
+            const artifactsRoot = document.querySelector('.main-content') || document;
+            const artifacts = artifactsRoot.querySelectorAll('.artifact');
+            if (artifacts.length === 0) {
+                const noArtifacts = document.createElement('div');
+                noArtifacts.className = 'file-explorer-item artifact';
+                noArtifacts.textContent = '(none)';
+                noArtifacts.style.opacity = '0.5';
+                artifactsSection.appendChild(noArtifacts);
+            } else {
+                artifacts.forEach(artifact => {
+                    const artifactItem = document.createElement('div');
+                    artifactItem.className = 'file-explorer-item artifact';
+                    artifactItem.textContent = artifact.textContent;
+                    artifactItem.onclick = function() {
+                        artifact.click();
+                    };
+                    artifactsSection.appendChild(artifactItem);
+                });
+            }
+            
+            fileExplorer.appendChild(artifactsSection);
+            
+            return fileExplorer;
+        }
+
+        function initStatusWidget() {
+            let el = document.querySelector('.status-widget');
+            if (!el) {
+                el = document.createElement('div');
+                el.className = 'status-widget';
+                el.id = 'status-widget';
+                el.textContent = 'ready — Esc';
+                document.body.appendChild(el);
+            }
+        }
+
+        // Tools widget
+        // Last known pointer position in overlay coordinates; written by onPointerMove.
+        let _cursorX = 0;
+        let _cursorY = 0;
+        // Whether the tool cursor should be shown. Set by setActiveTool and by the
+        // pointer enter/leave handlers; several callers re-render only when it is true.
+        let _cursorVisible = false;
+        
+        function setActiveTool(tool) {
+            if (!tool || tool === 'none') {
+                document.body.dataset.tool = 'none';
+                localStorage.setItem('uvnote-active-tool', 'none');
+                setOverlayActive(false);
+                _cursorVisible = false;
+                // Remove active class from all tool buttons when deactivating
+                const toolButtons = document.querySelectorAll('.tools-widget .tool-button');
+                toolButtons.forEach(btn => btn.classList.remove('active'));
+                updateStateIndicator();
+                encodeToolStateToUrl();
+                return;
+            }
+            document.body.dataset.tool = tool;
+            localStorage.setItem('uvnote-active-tool', tool);
+            setOverlayActive(true);
+            _cursorVisible = true;
+            updateStateIndicator();
+            encodeToolStateToUrl();
+        }
+
+        // Make setActiveTool globally accessible for ESC key handler
+        window.setActiveTool = setActiveTool;
+        
+
+        function getArrowColor() {
+            const saved = localStorage.getItem('uvnote-arrow-color');
+            if (saved) return saved;
+            return '#e53935'; // Default red color
+        }
+
+        function setStoredArrowColor(color) {
+            try { localStorage.setItem('uvnote-arrow-color', color); } catch (_) {}
+        }
+
+        function getLineThickness() {
+            const saved = localStorage.getItem('uvnote-line-thickness');
+            if (saved) return parseInt(saved, 10);
+            return 6; // default thickness
+        }
+
+        function setStoredLineThickness(thickness) {
+            try { localStorage.setItem('uvnote-line-thickness', thickness); } catch (_) {}
+        }
+
+        function getFadeoutTime() {
+            const saved = localStorage.getItem('uvnote-fadeout-time');
+            if (saved) return parseInt(saved, 10);
+            return 5; // default 5 seconds
+        }
+
+        function setStoredFadeoutTime(seconds) {
+            try { localStorage.setItem('uvnote-fadeout-time', seconds); } catch (_) {}
+        }
+
+        function createToolsWidget() {
+            const tools = document.createElement('div');
+            tools.className = 'tools-widget';
+
+            const title = document.createElement('div');
+            title.className = 'tools-title';
+            title.textContent = 'tools';
+            tools.appendChild(title);
+
+            const row = document.createElement('div');
+            row.className = 'tools-row';
+            tools.appendChild(row);
+
+            // Arrow tool
+            const arrowBtn = document.createElement('div');
+            arrowBtn.className = 'tool-button';
+            arrowBtn.textContent = 'arrow';
+            arrowBtn.onclick = function() {
+                const isActive = arrowBtn.classList.contains('active');
+                if (isActive) {
+                    arrowBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    arrowBtn.classList.add('active');
+                    setActiveTool('arrow');
+                }
+            };
+            row.appendChild(arrowBtn);
+
+            // Pen tool
+            const penBtn = document.createElement('div');
+            penBtn.className = 'tool-button';
+            penBtn.textContent = 'pen';
+            penBtn.onclick = function() {
+                const isActive = penBtn.classList.contains('active');
+                if (isActive) {
+                    penBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    penBtn.classList.add('active');
+                    setActiveTool('pen');
+                }
+            };
+            row.appendChild(penBtn);
+
+            // Eraser tool
+            const eraseBtn = document.createElement('div');
+            eraseBtn.className = 'tool-button';
+            eraseBtn.textContent = 'eraser';
+            eraseBtn.onclick = function() {
+                const isActive = eraseBtn.classList.contains('active');
+                if (isActive) {
+                    eraseBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    eraseBtn.classList.add('active');
+                    setActiveTool('eraser');
+                }
+            };
+            row.appendChild(eraseBtn);
+
+            // Spotlight tool
+            const spotlightBtn = document.createElement('div');
+            spotlightBtn.className = 'tool-button';
+            spotlightBtn.textContent = 'spotlight';
+            spotlightBtn.onclick = function() {
+                const isActive = spotlightBtn.classList.contains('active');
+                if (isActive) {
+                    spotlightBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    spotlightBtn.classList.add('active');
+                    setActiveTool('spotlight');
+                }
+            };
+            row.appendChild(spotlightBtn);
+
+            // Clear all
+            const clearBtn = document.createElement('div');
+            clearBtn.className = 'tool-button';
+            clearBtn.textContent = 'clear';
+            clearBtn.onclick = function() {
+                _shapes = [];
+                saveShapes();
+                renderOverlay();
+            };
+            row.appendChild(clearBtn);
+            
+            // We'll add the copy button at the end of the widget
+
+            // Restore active state from storage
+            const saved = localStorage.getItem('uvnote-active-tool') || 'none';
+            if (saved === 'arrow') {
+                arrowBtn.classList.add('active');
+                setActiveTool('arrow');
+            } else if (saved === 'pen') {
+                penBtn.classList.add('active');
+                setActiveTool('pen');
+            } else if (saved === 'eraser') {
+                eraseBtn.classList.add('active');
+                setActiveTool('eraser');
+            } else if (saved === 'spotlight') {
+                spotlightBtn.classList.add('active');
+                setActiveTool('spotlight');
+            }
+
+            // Color selector
+            const colorTitle = document.createElement('div');
+            colorTitle.className = 'tools-section-title';
+            colorTitle.textContent = 'color';
+            tools.appendChild(colorTitle);
+
+            const colorRow = document.createElement('div');
+            colorRow.className = 'tools-row color-row';
+            tools.appendChild(colorRow);
+
+            const swatchColors = [
+                // Primary colors
+                '#e53935', '#fb8c00', '#fdd835', '#43a047', '#1e88e5', '#8e24aa',
+                // Additional useful colors  
+                '#ff5722', '#795548', '#607d8b', '#9c27b0',
+                // Grayscale
+                '#000000', '#424242', '#9e9e9e', '#ffffff'
+            ];
+            const swatches = [];
+            swatchColors.forEach(c => {
+                const s = document.createElement('div');
+                s.className = 'color-swatch';
+                s.style.backgroundColor = c;
+                s.title = c;
+                s.onclick = () => {
+                    setStoredArrowColor(c);
+                    refreshColorUI(c);
+                    if (_cursorVisible) renderOverlay();
+                    encodeToolStateToUrl();
+                };
+                colorRow.appendChild(s);
+                swatches.push(s);
+            });
+
+            const colorInput = document.createElement('input');
+            colorInput.type = 'color';
+            colorInput.className = 'color-input';
+            colorInput.oninput = () => {
+                setStoredArrowColor(colorInput.value);
+                refreshColorUI(colorInput.value);
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+            colorRow.appendChild(colorInput);
+
+            function refreshColorUI(selected) {
+                const selectedHex = selected.startsWith('#') ? selected.toLowerCase() : rgbToHex(selected);
+                
+                swatches.forEach((s, i) => {
+                    const swatchHex = swatchColors[i].toLowerCase();
+                    if (swatchHex === selectedHex) {
+                        s.classList.add('selected');
+                    } else {
+                        s.classList.remove('selected');
+                    }
+                });
+                
+                try { 
+                    colorInput.value = selectedHex; 
+                } catch (_) {}
+            }
+
+            function rgbToHex(rgb) {
+                const m = rgb.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)\)/i);
+                if (!m) return '#000000';
+                const r = parseInt(m[1]).toString(16).padStart(2, '0');
+                const g = parseInt(m[2]).toString(16).padStart(2, '0');
+                const b = parseInt(m[3]).toString(16).padStart(2, '0');
+                return `#${r}${g}${b}`;
+            }
+
+            // Restore color selection
+            refreshColorUI(getArrowColor());
+
+            // Thickness slider
+            const thicknessTitle = document.createElement('div');
+            thicknessTitle.className = 'tools-section-title';
+            thicknessTitle.textContent = 'thickness';
+            tools.appendChild(thicknessTitle);
+
+            const thicknessRow = document.createElement('div');
+            thicknessRow.className = 'thickness-row';
+            tools.appendChild(thicknessRow);
+
+            const thicknessSlider = document.createElement('input');
+            thicknessSlider.type = 'range';
+            thicknessSlider.className = 'thickness-slider';
+            thicknessSlider.min = '1';
+            thicknessSlider.max = '10';
+            thicknessSlider.value = getLineThickness();
+            
+            const thicknessValue = document.createElement('span');
+            thicknessValue.className = 'thickness-value';
+            thicknessValue.textContent = thicknessSlider.value + 'px';
+
+            thicknessSlider.oninput = function() {
+                const value = parseInt(thicknessSlider.value, 10);
+                setStoredLineThickness(value);
+                thicknessValue.textContent = value + 'px';
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+
+            thicknessRow.appendChild(thicknessSlider);
+            thicknessRow.appendChild(thicknessValue);
+
+            // Fadeout time slider
+            const fadeoutTitle = document.createElement('div');
+            fadeoutTitle.className = 'tools-section-title';
+            fadeoutTitle.textContent = 'fadeout time';
+            tools.appendChild(fadeoutTitle);
+
+            const fadeoutRow = document.createElement('div');
+            fadeoutRow.className = 'thickness-row';
+            tools.appendChild(fadeoutRow);
+
+            const fadeoutSlider = document.createElement('input');
+            fadeoutSlider.type = 'range';
+            fadeoutSlider.className = 'thickness-slider';
+            fadeoutSlider.min = '0';
+            fadeoutSlider.max = '30';
+            fadeoutSlider.value = getFadeoutTime();
+            
+            const fadeoutValue = document.createElement('span');
+            fadeoutValue.className = 'thickness-value';
+            fadeoutValue.textContent = fadeoutSlider.value === '0' ? 'never' : fadeoutSlider.value + 's';
+
+            fadeoutSlider.oninput = function() {
+                const value = parseInt(fadeoutSlider.value, 10);
+                setStoredFadeoutTime(value);
+                fadeoutValue.textContent = value === 0 ? 'never' : value + 's';
+                encodeToolStateToUrl();
+            };
+
+            fadeoutRow.appendChild(fadeoutSlider);
+            fadeoutRow.appendChild(fadeoutValue);
+
+            // Draggable behavior
+            makeDraggable(tools, 'uvnote-tools-pos', title);
+
+            return tools;
+        }
+
+        function initTools() {
+            const widget = createToolsWidget();
+            document.body.appendChild(widget);
+        }
+
+        function teardownTools() {
+            const w = document.querySelector('.tools-widget');
+            if (w && w.parentNode) w.parentNode.removeChild(w);
+        }
+
+        // --- Canvas overlay for tools ---
+        // Module-level state for the drawing overlay.
+        let _overlay = null; // overlay element; sized/positioned by updateOverlayBounds (presumably a <canvas> — confirm where it is created)
+        let _overlayCtx = null; // presumably the overlay's drawing context; assigned outside this section
+        let _overlayContainer = null; // scroll container the overlay tracks; updateOverlayModeAndContainer sets it to window
+        let _overlayMode = 'single'; // only 'single' is ever assigned in the visible code
+        let _overlayResizeHandler = null; // presumably the registered resize listener, kept for removal — confirm
+        let _overlayScrollHandler = null; // presumably the registered scroll listener, kept for removal — confirm
+        let _drawing = null; // shape currently being drawn (arrow {x1,y1,x2,y2}, pen {points}, or spotlight {x,y,radius}); null when idle
+        let _shapes = []; // committed shapes for current mode
+        let _fadeTimer = null; // timer for fade animation
+        let _urlLoadedTool = false; // track if tool was loaded from URL
+        let _isInitializing = true; // prevent URL updates during initialization
+        let _initialCellStates = {}; // track initial cell states from page load
+
+        function getOverlayStorageKey() { return 'uvnote-shapes'; }
+
+        function loadShapes() {
+            try {
+                const raw = localStorage.getItem(getOverlayStorageKey());
+                _shapes = raw ? JSON.parse(raw) : [];
+            } catch (_) { _shapes = []; }
+        }
+
+        function saveShapes() {
+            try { 
+                localStorage.setItem(getOverlayStorageKey(), JSON.stringify(_shapes));
+                // Always update URL when shapes change
+                encodeToolStateToUrl();
+            } catch (_) {}
+        }
+
+        function updateShapesFade() {
+            const now = Date.now();
+            let needsUpdate = false;
+
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const shape = _shapes[i];
+                if (!shape.createdAt) continue; // Skip old shapes without timestamps
+                
+                // Use individual shape's fadeout time, or global if not set
+                const shapesFadeoutSeconds = shape.fadeoutTime !== undefined ? shape.fadeoutTime : getFadeoutTime();
+                
+                // Skip fading if fadeout is disabled for this shape
+                if (shapesFadeoutSeconds === 0) continue;
+                
+                const fadeStartTime = Math.max(0, (shapesFadeoutSeconds - 2) * 1000); // Start fading 2s before end
+                const fadeEndTime = shapesFadeoutSeconds * 1000; // Fully gone after specified time
+                const age = now - shape.createdAt;
+                
+                if (age >= fadeEndTime) {
+                    // Remove completely faded shapes
+                    _shapes.splice(i, 1);
+                    needsUpdate = true;
+                } else if (age >= fadeStartTime) {
+                    // Update opacity for fading shapes
+                    const fadeProgress = (age - fadeStartTime) / (fadeEndTime - fadeStartTime);
+                    const newOpacity = 1 - fadeProgress;
+                    if (Math.abs(shape.opacity - newOpacity) > 0.01) {
+                        shape.opacity = newOpacity;
+                        needsUpdate = true;
+                    }
+                }
+            }
+
+            if (needsUpdate) {
+                saveShapes();
+                renderOverlay();
+                // Update URL to remove faded shapes  
+                encodeToolStateToUrl();
+            }
+        }
+
+        function getContentContainer() { return window; }
+
+        function updateOverlayModeAndContainer() {
+            _overlayContainer = window;
+            _overlayMode = 'single';
+        }
+
+        function updateOverlayBounds() {
+            if (!_overlay) return;
+            if (_overlayContainer === window) {
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = '0px';
+                _overlay.style.top = '0px';
+                _overlay.width = window.innerWidth;
+                _overlay.height = window.innerHeight;
+            } else {
+                const rect = _overlayContainer.getBoundingClientRect();
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = rect.left + 'px';
+                _overlay.style.top = rect.top + 'px';
+                _overlay.width = Math.max(0, Math.floor(rect.width));
+                _overlay.height = Math.max(0, Math.floor(rect.height));
+            }
+            renderOverlay();
+        }
+
+        function containerScrollLeft() {
+            return (_overlayContainer === window) ? (window.scrollX || 0) : (_overlayContainer.scrollLeft || 0);
+        }
+        function containerScrollTop() {
+            return (_overlayContainer === window) ? (window.scrollY || 0) : (_overlayContainer.scrollTop || 0);
+        }
+
+        function toCanvasCoords(clientX, clientY) {
+            const rect = _overlay.getBoundingClientRect();
+            return { x: clientX - rect.left, y: clientY - rect.top };
+        }
+
+        function onPointerDown(e) {
+            const tool = document.body.dataset.tool;
+            if (tool === 'arrow') {
+                startDrawArrow(e);
+            } else if (tool === 'pen') {
+                startDrawPen(e);
+            } else if (tool === 'eraser') {
+                eraseAt(e);
+            } else if (tool === 'spotlight') {
+                startDrawSpotlight(e);
+            }
+        }
+
+        function onPointerMove(e) {
+            // Update cursor position
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _cursorX = pt.x;
+            _cursorY = pt.y;
+            
+            if (!_drawing) {
+                // Just update cursor position and re-render
+                if (_cursorVisible) {
+                    renderOverlay();
+                }
+                return;
+            }
+            
+            if (_drawing.type === 'pen') {
+                moveDrawPen(e);
+            } else if (_drawing.type === 'spotlight') {
+                moveDrawSpotlight(e);
+            } else {
+                moveDrawArrow(e);
+            }
+        }
+        
+        function onPointerEnter(e) {
+            _cursorVisible = document.body.dataset.tool !== 'none';
+            if (_cursorVisible) {
+                renderOverlay();
+            }
+        }
+        
+        function onPointerLeave(e) {
+            _cursorVisible = false;
+            renderOverlay();
+        }
+
+        function onPointerUp(e) {
+            if (!_drawing) return;
+            if (_drawing.type === 'pen') {
+                endDrawPen();
+            } else if (_drawing.type === 'spotlight') {
+                endDrawSpotlight();
+            } else {
+                endDrawArrow();
+            }
+        }
+
+        function startDrawArrow(e) {
+            if (document.body.dataset.tool !== 'arrow') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                x1: pt.x + containerScrollLeft(),
+                y1: pt.y + containerScrollTop(),
+                x2: pt.x + containerScrollLeft(),
+                y2: pt.y + containerScrollTop(),
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawArrow(e) {
+            if (!_drawing) return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.x2 = pt.x + containerScrollLeft();
+            _drawing.y2 = pt.y + containerScrollTop();
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawArrow() {
+            if (!_drawing) return;
+            _shapes.push({ 
+                type: 'arrow', 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawPen(e) {
+            if (document.body.dataset.tool !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'pen',
+                points: [{
+                    x: pt.x + containerScrollLeft(),
+                    y: pt.y + containerScrollTop()
+                }],
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawPen(e) {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.points.push({
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop()
+            });
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawPen() {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            if (_drawing.points.length > 1) {
+                _shapes.push({ 
+                    ..._drawing,
+                    createdAt: Date.now(),
+                    fadeoutTime: getFadeoutTime(),
+                    opacity: 1.0
+                });
+            }
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawSpotlight(e) {
+            if (document.body.dataset.tool !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'spotlight',
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop(),
+                radius: getLineThickness() * 20, // Use thickness to control spotlight size (bigger default)
+                color: getArrowColor()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawSpotlight(e) {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const dx = pt.x + containerScrollLeft() - _drawing.x;
+            const dy = pt.y + containerScrollTop() - _drawing.y;
+            _drawing.radius = Math.max(20, Math.sqrt(dx * dx + dy * dy)); // Minimum radius of 20
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawSpotlight() {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            _shapes.push({ 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function distPointToSegment(px, py, x1, y1, x2, y2) {
+            const dx = x2 - x1, dy = y2 - y1;
+            if (dx === 0 && dy === 0) return Math.hypot(px - x1, py - y1);
+            const t = Math.max(0, Math.min(1, ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)));
+            const cx = x1 + t * dx, cy = y1 + t * dy;
+            return Math.hypot(px - cx, py - cy);
+        }
+
+        function eraseAt(e) {
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const x = pt.x + containerScrollLeft();
+            const y = pt.y + containerScrollTop();
+            const threshold = 10; // pixels
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const s = _shapes[i];
+                if (s.type === 'arrow') {
+                    const d = distPointToSegment(x, y, s.x1, s.y1, s.x2, s.y2);
+                    if (d <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                } else if (s.type === 'pen' && s.points) {
+                    // Check if click is near any line segment in the pen stroke
+                    let minDist = Infinity;
+                    for (let j = 1; j < s.points.length; j++) {
+                        const d = distPointToSegment(x, y, s.points[j-1].x, s.points[j-1].y, s.points[j].x, s.points[j].y);
+                        minDist = Math.min(minDist, d);
+                    }
+                    if (minDist <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                }
+            }
+            e.preventDefault();
+        }
+
+        function drawArrow(ctx, x1, y1, x2, y2, color, width, opacity = 1.0) {
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.fillStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            
+            // Check if points are too close (initial state)
+            const dx = x2 - x1;
+            const dy = y2 - y1;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            
+            if (distance < 5) {
+                // Draw just a small arrowhead pointing down-right when first clicked
+                const defaultAngle = Math.PI / 4; // 45 degrees (down-right)
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate arrowhead points
+                const hx1 = x1 + headLength * Math.cos(defaultAngle - headAngle);
+                const hy1 = y1 + headLength * Math.sin(defaultAngle - headAngle);
+                const hx2 = x1 + headLength * Math.cos(defaultAngle + headAngle);
+                const hy2 = y1 + headLength * Math.sin(defaultAngle + headAngle);
+                
+                // Draw arrowhead only
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            } else {
+                // Normal arrow drawing - head at x1,y1, tail at x2,y2
+                const angle = Math.atan2(y1 - y2, x1 - x2);
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate where the line should end (before the arrowhead)
+                const lineEndX = x1 - headLength * 0.8 * Math.cos(angle);
+                const lineEndY = y1 - headLength * 0.8 * Math.sin(angle);
+                
+                // Draw the line from tail to near the head
+                ctx.beginPath();
+                ctx.moveTo(x2, y2);
+                ctx.lineTo(lineEndX, lineEndY);
+                ctx.stroke();
+                
+                // Calculate arrowhead points
+                const hx1 = x1 - headLength * Math.cos(angle - headAngle);
+                const hy1 = y1 - headLength * Math.sin(angle - headAngle);
+                const hx2 = x1 - headLength * Math.cos(angle + headAngle);
+                const hy2 = y1 - headLength * Math.sin(angle + headAngle);
+                
+                // Draw arrowhead
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            }
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawPen(ctx, points, color, width, offX, offY, opacity = 1.0) {
+            if (!points || points.length < 2) return;
+            
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            ctx.beginPath();
+            ctx.moveTo(points[0].x - offX, points[0].y - offY);
+            for (let i = 1; i < points.length; i++) {
+                ctx.lineTo(points[i].x - offX, points[i].y - offY);
+            }
+            ctx.stroke();
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawAllSpotlights(ctx, spotlights, offX, offY) {
+            if (!spotlights || spotlights.length === 0) return;
+            
+            ctx.save();
+            
+            // Calculate the overall opacity based on all spotlights
+            const maxOpacity = Math.max(...spotlights.map(s => s.opacity || 1.0));
+            
+            // Fill entire canvas with dark overlay
+            ctx.fillStyle = `rgba(0, 0, 0, ${0.7 * maxOpacity})`;
+            ctx.fillRect(0, 0, ctx.canvas.width, ctx.canvas.height);
+            
+            // Cut out completely transparent holes for all spotlights
+            ctx.globalCompositeOperation = 'destination-out';
+            ctx.fillStyle = 'rgba(0, 0, 0, 1)'; // Solid black to ensure complete removal
+            for (const spotlight of spotlights) {
+                ctx.beginPath();
+                ctx.arc(spotlight.x - offX, spotlight.y - offY, spotlight.radius, 0, 2 * Math.PI);
+                ctx.fill();
+            }
+            
+            ctx.restore();
+        }
+
+        // Repaint the whole drawing overlay from scratch.
+        //
+        // Paint order:
+        //   1. committed arrows and pen strokes from _shapes
+        //   2. the in-progress arrow or pen stroke (_drawing), if any
+        //   3. the spotlight layer: committed spotlights, the in-progress
+        //      spotlight, and a cursor-preview spotlight, all in one pass
+        //   4. a cursor indicator for the non-spotlight tools
+        //
+        // Shapes are stored in world coordinates; the current container scroll
+        // offset is subtracted to get canvas coordinates. No-op until the overlay
+        // canvas and its 2D context exist.
+        //
+        // NOTE(review): drawAllSpotlights() fills the canvas with a dark layer and
+        // then cuts holes with 'destination-out'. Because it runs after steps 1-2,
+        // arrows and pen strokes are dimmed outside the holes and erased inside
+        // them whenever a spotlight (or its cursor preview) is present. Confirm
+        // that this is the intended look.
+        function renderOverlay() {
+            if (!_overlay || !_overlayCtx) return;
+            _overlayCtx.clearRect(0, 0, _overlay.width, _overlay.height);
+            const offX = containerScrollLeft();
+            const offY = containerScrollTop();
+            // Draw non-spotlight shapes first
+            for (const s of _shapes) {
+                // Shapes without an opacity field are drawn fully opaque.
+                const opacity = s.opacity !== undefined ? s.opacity : 1.0;
+                if (s.type === 'arrow') {
+                    drawArrow(_overlayCtx, s.x1 - offX, s.y1 - offY, s.x2 - offX, s.y2 - offY, s.color || '#f00', s.width || 2, opacity);
+                } else if (s.type === 'pen') {
+                    // drawPen applies the offset itself, so points are passed in world coordinates.
+                    drawPen(_overlayCtx, s.points, s.color || '#f00', s.width || 2, offX, offY, opacity);
+                }
+            }
+            // Draw current drawing (non-spotlight)
+            if (_drawing) {
+                if (_drawing.type === 'pen') {
+                    drawPen(_overlayCtx, _drawing.points, _drawing.color, _drawing.width, offX, offY);
+                } else if (_drawing.type !== 'spotlight') {
+                    // Anything that is neither a pen stroke nor a spotlight is drawn as an arrow.
+                    drawArrow(_overlayCtx, _drawing.x1 - offX, _drawing.y1 - offY, _drawing.x2 - offX, _drawing.y2 - offY, _drawing.color, _drawing.width);
+                }
+            }
+            
+            // Collect all spotlights (existing + current drawing + cursor preview)
+            const spotlights = [];
+            
+            // Add existing spotlight shapes
+            for (const s of _shapes) {
+                if (s.type === 'spotlight') {
+                    spotlights.push({
+                        x: s.x,
+                        y: s.y, 
+                        radius: s.radius,
+                        opacity: s.opacity !== undefined ? s.opacity : 1.0
+                    });
+                }
+            }
+            
+            // Add current spotlight being drawn
+            if (_drawing && _drawing.type === 'spotlight') {
+                spotlights.push({
+                    x: _drawing.x,
+                    y: _drawing.y,
+                    radius: _drawing.radius,
+                    opacity: 1.0
+                });
+            }
+            
+            // Add cursor preview spotlight if tool is active
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                if (tool === 'spotlight') {
+                    const thickness = getLineThickness();
+                    // Same thickness * 20 sizing that startDrawSpotlight uses for a new spotlight.
+                    const radius = thickness * 20;
+                    // _cursorX/_cursorY are canvas coordinates; convert to world
+                    // coordinates because drawAllSpotlights subtracts the offset again.
+                    const cursorWorldX = _cursorX + containerScrollLeft();
+                    const cursorWorldY = _cursorY + containerScrollTop();
+                    spotlights.push({
+                        x: cursorWorldX,
+                        y: cursorWorldY,
+                        radius: radius,
+                        opacity: 0.8
+                    });
+                }
+            }
+            
+            // Draw all spotlights as a single overlay with multiple holes
+            drawAllSpotlights(_overlayCtx, spotlights, offX, offY);
+            
+            // Draw cursor indicators for non-spotlight tools
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                const color = getArrowColor();
+                const thickness = getLineThickness();
+                
+                if (tool !== 'spotlight') {
+                    // save()/restore() keep the indicator's alpha and styles from leaking out.
+                    _overlayCtx.save();
+                    _overlayCtx.fillStyle = color;
+                    _overlayCtx.globalAlpha = 0.7;
+                    
+                    if (tool === 'eraser') {
+                        // Draw eraser indicator
+                        // (ring of radius 10, matching the 10px hit threshold in eraseAt)
+                        _overlayCtx.strokeStyle = color;
+                        _overlayCtx.lineWidth = 2;
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, 10, 0, 2 * Math.PI);
+                        _overlayCtx.stroke();
+                    } else {
+                        // Draw dot for pen/arrow
+                        // (diameter equals the current line thickness)
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, thickness / 2, 0, 2 * Math.PI);
+                        _overlayCtx.fill();
+                    }
+                    
+                    _overlayCtx.restore();
+                }
+            }
+        }
+
+        function setOverlayActive(active) {
+            if (!_overlay) initOverlay();
+            _overlay.style.pointerEvents = active ? 'auto' : 'none';
+            _overlay.style.cursor = active ? 'none' : 'auto';
+            // Re-render to ensure visibility aligns with content
+            renderOverlay();
+        }
+
+        function initOverlay() {
+            if (_overlay) return;
+            updateOverlayModeAndContainer();
+            _overlay = document.createElement('canvas');
+            _overlay.className = 'draw-overlay';
+            _overlayCtx = _overlay.getContext('2d');
+            document.body.appendChild(_overlay);
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+
+            // Events
+            _overlay.addEventListener('mousedown', onPointerDown);
+            _overlay.addEventListener('mousemove', onPointerMove);
+            _overlay.addEventListener('mouseenter', onPointerEnter);
+            _overlay.addEventListener('mouseleave', onPointerLeave);
+            document.addEventListener('mouseup', onPointerUp);
+            _overlay.addEventListener('touchstart', onPointerDown, { passive: false });
+            _overlay.addEventListener('touchmove', onPointerMove, { passive: false });
+            document.addEventListener('touchend', onPointerUp);
+
+            _overlayResizeHandler = () => updateOverlayBounds();
+            window.addEventListener('resize', _overlayResizeHandler);
+
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+            
+            // Start fade animation timer
+            _fadeTimer = setInterval(updateShapesFade, 100); // Update every 100ms for smooth fade
+        }
+
+        function rebindOverlayContainer() {
+            if (!_overlay) return;
+            // Remove old scroll handler
+            if (_overlayScrollHandler) { window.removeEventListener('scroll', _overlayScrollHandler); }
+            updateOverlayModeAndContainer();
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+        }
+
+        function teardownOverlay() {
+            if (!_overlay) return;
+            _overlay.removeEventListener('mousedown', onPointerDown);
+            _overlay.removeEventListener('mousemove', onPointerMove);
+            _overlay.removeEventListener('mouseenter', onPointerEnter);
+            _overlay.removeEventListener('mouseleave', onPointerLeave);
+            document.removeEventListener('mouseup', onPointerUp);
+            _overlay.removeEventListener('touchstart', onPointerDown);
+            _overlay.removeEventListener('touchmove', onPointerMove);
+            document.removeEventListener('touchend', onPointerUp);
+            if (_overlayResizeHandler) window.removeEventListener('resize', _overlayResizeHandler);
+            if (_overlayScrollHandler) {
+                if (_overlayContainer === window) {
+                    window.removeEventListener('scroll', _overlayScrollHandler);
+                } else if (_overlayContainer) {
+                    _overlayContainer.removeEventListener('scroll', _overlayScrollHandler);
+                }
+            }
+            if (_fadeTimer) {
+                clearInterval(_fadeTimer);
+                _fadeTimer = null;
+            }
+            if (_overlay.parentNode) _overlay.parentNode.removeChild(_overlay);
+            _overlay = null; _overlayCtx = null; _overlayContainer = null; _overlayResizeHandler = null; _overlayScrollHandler = null; _drawing = null;
+        }
+        
+        function teardownFileExplorer() {
+            const fe = document.querySelector('.file-explorer');
+            if (fe && fe.parentNode) fe.parentNode.removeChild(fe);
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+
+        function runCell(cellId){
+            const btn=document.querySelector('.run-btn[onclick*="'+cellId+'"]');
+            const output=document.getElementById('output-'+cellId);
+            if(btn){btn.textContent='⏳ running...';btn.disabled=true;}
+            if(output){output.classList.add('output-stale');}
+            fetch('/run/'+cellId,{method:'POST'}).then(r=>r.json()).then(data=>{
+                if(output){
+                    output.classList.remove('output-stale');
+                    let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
+
+                    console.log('UV Logs:', data);
+                    if(data.stderr) {
+                        // Split UV logs from regular stderr
+                        const lines = data.stderr.split('\\n');
+                        let uvLogs = [];
+                        let regularLogs = [];
+                        let inUvSection = true;
+
+                        for (const line of lines) {
+                            if (inUvSection) {
+                                uvLogs.push(line);
+                                if (line.startsWith('Installed ')) {
+                                    inUvSection = false;
+                                }
+                            } else {
+                                regularLogs.push(line);
+                            }
+                        }
+                        
+
+                        // If we never found "Installed", treat it all as regular stderr
+                        if (inUvSection) {
+                            html+='<div class="cell-stderr">'+escapeHtml(data.stderr)+'</div>';
+                        } else {
+                            const uvLogsStr = uvLogs.join('\\n');
+                            const regularLogsStr = regularLogs.join('\\n').trim();
+
+                            if (uvLogsStr) {
+                                html+='<div class="uv-install-logs">';
+                                html+='<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>';
+                                html+='<div class="uv-logs-content" style="display: none;">'+escapeHtml(uvLogsStr)+'</div>';
+                                html+='</div>';
+                            }
+                            if (regularLogsStr) {
+                                html+='<div class="cell-stderr">'+escapeHtml(regularLogsStr)+'</div>';
+                            }
+                        }
+                    }
+                    output.innerHTML=html;
+                }
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            }).catch(e=>{
+                console.error('Run failed:',e);
+                if(output){output.classList.remove('output-stale');}
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            });
+        }
+
+        function copyCell(cellId){
+            // Try multiple selectors to find the code element
+            // Pygments generates .highlight > pre with spans, not wrapped in <code>
+            let codeElement = document.querySelector('#code-'+cellId+' .highlight pre');
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' pre');
+            }
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' code');
+            }
+            if (!codeElement) {
+                // Fallback to the code div itself
+                codeElement = document.getElementById('code-'+cellId);
+            }
+
+            const btn = document.querySelector('.copy-btn[onclick*="'+cellId+'"]');
+
+            if (!codeElement) {
+                console.error('Code element not found for cell:', cellId);
+                return;
+            }
+            if (!btn) {
+                console.error('Copy button not found for cell:', cellId);
+                return;
+            }
+
+            const codeText = codeElement.textContent;
+            
+            if (navigator.clipboard && navigator.clipboard.writeText) {
+                navigator.clipboard.writeText(codeText).then(function() {
+                    console.log('Clipboard copy successful');
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                }).catch(function(err) {
+                    console.warn('Clipboard copy failed:', err);
+                    fallbackCopy();
+                });
+            } else {
+                console.log('Using fallback copy method');
+                fallbackCopy();
+            }
+            
+            function fallbackCopy() {
+                const textarea = document.createElement('textarea');
+                textarea.value = codeText;
+                textarea.style.position = 'absolute';
+                textarea.style.left = '-9999px';
+                document.body.appendChild(textarea);
+                textarea.select();
+                try {
+                    const success = document.execCommand('copy');
+                    console.log('Fallback copy success:', success);
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                } catch (err) {
+                    console.error('Fallback copy failed:', err);
+                    btn.textContent = 'Copy failed';
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                    }, 2000);
+                }
+                document.body.removeChild(textarea);
+            }
+        }
+
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
+
+
+        // Page bootstrap. Once the DOM is parsed: snapshot the initial cell
+        // states, sync theme/menu UI, optionally build the widgets, restore any
+        // deep-link selection and cell states encoded in the URL, and install the
+        // document-level mouse and keyboard handlers.
+        document.addEventListener('DOMContentLoaded', function() {
+            // Capture initial cell states before any modifications
+            captureInitialCellStates();
+            
+            updateThemeIcon();
+            updateUiMenu();
+            updateUiDebug();
+            // Widgets are built unless <html data-widgets="..."> is set to something
+            // other than 'on'; a missing attribute counts as 'on'.
+            const widgetsEnabled = (document.documentElement.getAttribute('data-widgets') || 'on') === 'on';
+            if (widgetsEnabled) {
+            initMinimap();
+            initFileExplorer();
+            initTools();
+            initOverlay();
+            initStatusWidget();
+            initializeWidgetVisibility();
+            layoutWidgetsStackedBottomRight();
+            window.addEventListener('resize', layoutWidgetsStackedBottomRight);
+            }
+
+            // Apply deep-link selection if present
+            applyLocationFromUrl();
+            updateStateIndicator();
+            
+            // Apply cell states from URL immediately
+            // (read from the 'cells' query parameter)
+            const url = new URL(window.location.href);
+            const encodedCellStates = url.searchParams.get('cells');
+            if (encodedCellStates) {
+                console.log('Applying cell states from URL...');
+                const cellStates = decodeCellStatesFromUrl(encodedCellStates);
+                
+                // Use requestAnimationFrame to ensure DOM is ready
+                requestAnimationFrame(() => {
+                    applyCellStatesFromUrl(cellStates);
+                    
+                    // Clear initialization flag after cell states are applied
+                    // (the typeof guard avoids a ReferenceError if the flag is not declared)
+                    if (typeof _isInitializing !== 'undefined') {
+                        _isInitializing = false;
+                    }
+                });
+            } else {
+                // Clear initialization flag even if no cell states
+                // (also deferred to the next animation frame, as in the branch above)
+                if (typeof _isInitializing !== 'undefined') {
+                    requestAnimationFrame(() => {
+                        _isInitializing = false;
+                    });
+                }
+            }
+
+            // Bind drag selection on line numbers
+            // (listeners sit on the document; the handlers are defined elsewhere in this script)
+            document.addEventListener('mousedown', onLineNumberMouseDown);
+            document.addEventListener('mousemove', onDocMouseMove);
+            document.addEventListener('mouseup', onDocMouseUp);
+
+            // Add ESC key handler to exit tools
+            document.addEventListener('keydown', function(e) {
+                // e.keyCode === 27 is the legacy equivalent of e.key === 'Escape'.
+                if (e.key === 'Escape' || e.keyCode === 27) {
+                    const currentTool = document.body.dataset.tool;
+                    if (currentTool && currentTool !== 'none') {
+                        // Deactivate the current tool
+                        window.setActiveTool('none');
+                    }
+                    // Also clear any active line selection
+                    clearSelection(true);
+                }
+            });
+        });
+
+        function updateStateIndicator() {
+            try {
+                const el = document.getElementById('status-widget');
+                if (!el) return;
+                const tool = document.body.dataset.tool || 'none';
+                if (tool && tool !== 'none') {
+                    el.textContent = `tool: ${tool} — Esc`;
+                    return;
+                }
+                if (_selection) {
+                    const t = _selection.a === _selection.b ? `L${_selection.a}` : `L${_selection.a}-${_selection.b}`;
+                    el.textContent = `selected: ${t} — Esc`;
+                    return;
+                }
+                el.textContent = 'ready — Esc';
+            } catch (_) {}
+        }
+    </script>
 </head>
+
+
 <body>
-  <h1>Index of /</h1>
-  <ul>
-    <li><a href='activation/index.html' class='dir'>activation/</a></li>
-    <li><a href='causal_conv1d/index.html' class='dir'>causal_conv1d/</a></li>
-    <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
-    <li><a href='layer_norm/index.html' class='dir'>layer_norm/</a></li>
-    <li><a href='rotary/index.html' class='dir'>rotary/</a></li>
-  </ul>
+    <div class="controls">
+        <div class="controls-buttons">
+            
+            <a href="index.html" class="back-button">← back</a>
+            
+            <div class="theme-toggle" onclick="toggleTheme()">light</div>
+            <div class="reset-toggle" onclick="resetLayout()">reset</div>
+            <div class="menu-button" onclick="toggleMenu()">
+                menu ▼
+                <div class="menu-dropdown">
+                    <div class="menu-item" onclick="setUiTheme('default')">
+                        <span class="menu-checkbox" id="checkbox-ui-default">☑</span> Theme: default
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('none')">
+                        <span class="menu-checkbox" id="checkbox-ui-none">☐</span> Theme: none
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('monocolor')">
+                        <span class="menu-checkbox" id="checkbox-ui-monocolor">☐</span> Theme: monocolor
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('tools')">
+                        <span class="menu-checkbox" id="checkbox-tools">☐</span> Tools
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('file-explorer')">
+                        <span class="menu-checkbox" id="checkbox-file-explorer">☐</span> File Explorer
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('minimap')">
+                        <span class="menu-checkbox" id="checkbox-minimap">☐</span> Table of Contents
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('status')">
+                        <span class="menu-checkbox" id="checkbox-status">☑</span> Status Indicator
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    
+    <div class="system-info">
+        <div class="system-info-header">Generated on:</div>
+        <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+        </div>
+    </div>
+    
+    <div class="main-content">
+        <h1>All Benchmarks Aggregated Report</h1>
+<h2><a href="layer_norm/">Layer Norm</a></h2>
+<div class="artifact-preview">
+<object data="layer_norm/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
+</object>
+</div>
+
+<table>
+<thead>
+<tr>
+<th>Implementation</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><a href="layer_norm/impls/hf_kernels_layer_norm.html">HF Kernels Layer Norm</a></td>
+<td>HuggingFace kernels implementation</td>
+</tr>
+<tr>
+<td><a href="layer_norm/impls/torch_layer_norm.html">PyTorch Layer Norm</a></td>
+<td>PyTorch native implementation</td>
+</tr>
+</tbody>
+</table>
+<h2><a href="rotary/">Rotary Position Embeddings</a></h2>
+<div class="artifact-preview">
+<object data="rotary/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
+</object>
+</div>
+
+<table>
+<thead>
+<tr>
+<th>Implementation</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><a href="rotary/impls/hf_kernels_rotary.html">HF Kernels Rotary</a></td>
+<td>HuggingFace kernels implementation</td>
+</tr>
+<tr>
+<td><a href="rotary/impls/torch_rotary.html">PyTorch Rotary</a></td>
+<td>PyTorch native implementation</td>
+</tr>
+</tbody>
+</table>
+<h2><a href="flash_attn/">Flash Attention</a></h2>
+<div class="artifact-preview">
+<object data="flash_attn/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
+</object>
+</div>
+
+<table>
+<thead>
+<tr>
+<th>Implementation</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><a href="flash_attn/impls/flash_attention.html">Flash Attention</a></td>
+<td>Flash Attention implementation</td>
+</tr>
+<tr>
+<td><a href="flash_attn/impls/hf_kernels_flash_attn.html">HF Kernels Flash Attention</a></td>
+<td>HuggingFace kernels Flash Attention</td>
+</tr>
+<tr>
+<td><a href="flash_attn/impls/hf_kernels_flash_attn3.html">HF Kernels Flash Attention 3</a></td>
+<td>HuggingFace kernels Flash Attention 3</td>
+</tr>
+<tr>
+<td><a href="flash_attn/impls/mem_efficient_attention.html">Memory Efficient Attention</a></td>
+<td>Memory efficient attention implementation</td>
+</tr>
+<tr>
+<td><a href="flash_attn/impls/sage_attention.html">Sage Attention</a></td>
+<td>Sage attention implementation</td>
+</tr>
+<tr>
+<td><a href="flash_attn/impls/xformers.html">xFormers</a></td>
+<td>xFormers attention implementation</td>
+</tr>
+</tbody>
+</table>
+<h2><a href="causal_conv1d/">Causal Conv1D</a></h2>
+<div class="artifact-preview">
+<object data="causal_conv1d/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
+</object>
+</div>
+
+<table>
+<thead>
+<tr>
+<th>Implementation</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><a href="causal_conv1d/impls/hf_kernels_causal_conv1d.html">HF Kernels Causal Conv1D</a></td>
+<td>HuggingFace kernels implementation</td>
+</tr>
+<tr>
+<td><a href="causal_conv1d/impls/torch_causal_conv1d.html">PyTorch Causal Conv1D</a></td>
+<td>PyTorch native implementation</td>
+</tr>
+</tbody>
+</table>
+<h2><a href="activation/">Activation</a></h2>
+<div class="artifact-preview">
+<object data="activation/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
+</object>
+</div>
+
+<table>
+<thead>
+<tr>
+<th>Implementation</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><a href="activation/impls/hf_kernels_swiglu.html">HF Kernels SwiGLU</a></td>
+<td>HuggingFace kernels SwiGLU implementation</td>
+</tr>
+<tr>
+<td><a href="activation/impls/torch_swiglu.html">PyTorch SwiGLU</a></td>
+<td>PyTorch native SwiGLU implementation</td>
+</tr>
+</tbody>
+</table>
+<h2><a href="relu/">ReLU</a></h2>
+<div class="artifact-preview">
+<object data="relu/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
+</object>
+</div>
+
+<table>
+<thead>
+<tr>
+<th>Implementation</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><a href="relu/impls/hf_kernels_relu.html">HF Kernels ReLU</a></td>
+<td>HuggingFace kernels ReLU implementation</td>
+</tr>
+<tr>
+<td><a href="relu/impls/torch_relu.html">PyTorch ReLU</a></td>
+<td>PyTorch native ReLU implementation</td>
+</tr>
+</tbody>
+</table>
+    </div>
+    
 </body>
 </html>
\ No newline at end of file
diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
index fcd809d60a69166f4be7343612f4f810d256a506..611975ecd9585a8b6f1198e5f9cf417087baa85d 100644
--- a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
+++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
@@ -1,4 +1,4 @@
-{"ts": "2025-10-28T14:08:59Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8268990000033227, "p50": 0.8360890000176369, "p90": 0.8378790000733716, "mean": 0.8358750000070359, "iqr": 0.002010000116570154, "raw_times": [0.8426389999840467, 0.8268990000033227, 0.8378790000733716, 0.8360890000176369, 0.8358689999568014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8452999999235544, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6477070000746608, "p50": 1.6516379999984565, "p90": 1.6565669999408783, "mean": 1.6533151999965412, "iqr": 0.006360999918797461, "raw_times": [1.6565669999408783, 1.6516379999984565, 1.6477070000746608, 1.6604579999466296, 1.6502060000220808], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6544470000781075, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6441269999631913, "p50": 1.6532669999378413, "p90": 1.6534970000066096, "mean": 1.6500411999913922, "iqr": 0.009149999982582813, "raw_times": [1.6441269999631913, 1.6534970000066096, 1.6532669999378413, 1.6443470000240268, 1.654968000025292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6665570000213847, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.251962999911484, "p50": 3.270412999995642, "p90": 3.2735430000911947, "mean": 3.2660931999998866, "iqr": 0.01632000009976764, "raw_times": [3.2735430000911947, 3.251962999911484, 3.257222999991427, 3.277324000009685, 3.270412999995642], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2640430000583365, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8322699999894212, "p50": 0.8364899999833142, "p90": 0.8382409999967422, "mean": 0.8359703999872181, "iqr": 0.0036810000096920703, "raw_times": [0.8322699999894212, 0.8382909999795629, 0.8345599999870501, 0.8382409999967422, 0.8364899999833142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8381600000006983, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6435499999829517, "p50": 1.6499199999771008, "p90": 1.6516499999852385, "mean": 1.650240399987979, "iqr": 0.0024989999474200886, "raw_times": [1.6516499999852385, 1.6435499999829517, 1.6499199999771008, 1.6491510000378184, 1.656930999956785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.644769999984419, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6425610000396773, "p50": 1.6517310000381258, "p90": 1.654420999955164, "mean": 1.6505027999983213, "iqr": 0.006990999963818467, "raw_times": [1.6474299999913455, 1.6517310000381258, 1.654420999955164, 1.6563709999672938, 1.6425610000396773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6471609999939574, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.237169999977141, "p50": 3.2576509999557857, "p90": 3.264301000001524, "mean": 3.257706599993071, "iqr": 0.008230999981151399, "raw_times": [3.2576509999557857, 3.264301000001524, 3.2733410000105323, 3.2560700000203724, 3.237169999977141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2725309999932506, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html
index cfec5b11856445875be968b8022bf0064c0ca56f..9e9cf8da940eb80e201b94351f6e97b42048c103 100644
--- a/layer_norm/impls/hf_kernels_layer_norm.html
+++ b/layer_norm/impls/hf_kernels_layer_norm.html
@@ -3862,8 +3862,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     </div>
     
     <div class="main-content">
-        <h2>on_github: huggingface/kernels-uvnotes</h2>
-<h1>HF Kernels LayerNorm Implementation</h1>
+        <h1>HF Kernels LayerNorm Implementation</h1>
 <p>Based on kernels-community <code>layer-norm</code> kernel.</p>
 <h2>LayerNorm Benchmark (HF Kernels)</h2>
 <div class="cell" id="cell-benchmark">
@@ -3873,10 +3872,11 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 7.03s
+Cell: benchmark | 6.34s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/hf_kernels_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="49">
 <div class="code-wrap">
@@ -3943,19 +3943,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         4.56%     180.575us        46.01%       1.822ms       1.822ms       0.000us         0.00%       3.098ms       3.098ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.70%      67.272us        40.91%       1.619ms     539.829us       2.362ms       100.00%       3.098ms       1.033ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.363ms       100.06%       2.363ms       2.363ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       2.362ms       100.00%       2.362ms     787.305us             3  
-                                Activity Buffer Request        36.75%       1.455ms        36.75%       1.455ms       1.455ms     736.127us        31.17%     736.127us     736.127us             1  
-                                             aten::view         0.54%      21.512us         0.54%      21.512us       3.585us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.17%      46.231us         1.17%      46.231us       5.137us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.23%       9.070us         0.23%       9.070us       3.023us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.06%      41.913us         1.06%      41.913us      13.971us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        53.99%       2.137ms        53.99%       2.137ms       2.137ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         5.26%     209.855us        46.73%       1.864ms       1.864ms       0.000us         0.00%       3.097ms       3.097ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         1.78%      70.832us        40.86%       1.630ms     543.337us       2.360ms       100.00%       3.097ms       1.032ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.362ms       100.06%       2.362ms       2.362ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       2.360ms       100.00%       2.360ms     786.699us             3  
+                                Activity Buffer Request        36.61%       1.461ms        36.61%       1.461ms       1.461ms     736.736us        31.22%     736.736us     736.736us             1  
+                                             aten::view         0.61%      24.271us         0.61%      24.271us       4.045us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.19%      47.642us         1.19%      47.642us       5.294us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.27%      10.789us         0.27%      10.789us       3.596us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.01%      40.102us         1.01%      40.102us      13.367us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.27%       2.125ms        53.27%       2.125ms       2.125ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.959ms
-Self CUDA time total: 2.362ms
+Self CPU time total: 3.989ms
+Self CUDA time total: 2.360ms
 
 
 
@@ -3965,19 +3965,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         2.19%     144.024us        30.18%       1.989ms       1.989ms       0.000us         0.00%       6.322ms       6.322ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.69%      45.641us        27.80%       1.832ms     610.764us       4.774ms       100.00%       6.322ms       2.107ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.776ms       100.03%       4.776ms       4.776ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.774ms       100.00%       4.774ms       1.591ms             3  
-                                Activity Buffer Request        26.09%       1.720ms        26.09%       1.720ms       1.720ms       1.548ms        32.42%       1.548ms       1.548ms             1  
-                                             aten::view         0.20%      12.871us         0.20%      12.871us       2.145us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.50%      32.981us         0.50%      32.981us       3.665us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.07%       4.881us         0.07%       4.881us       1.627us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.44%      29.151us         0.44%      29.151us       9.717us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        69.82%       4.602ms        69.82%       4.602ms       4.602ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         2.24%     143.733us        27.27%       1.751ms       1.751ms       0.000us         0.00%       6.440ms       6.440ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.75%      48.181us        24.84%       1.595ms     531.669us       4.846ms       100.00%       6.440ms       2.147ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.848ms       100.03%       4.848ms       4.848ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.846ms       100.00%       4.846ms       1.615ms             3  
+                                Activity Buffer Request        23.08%       1.482ms        23.08%       1.482ms       1.482ms       1.594ms        32.88%       1.594ms       1.594ms             1  
+                                             aten::view         0.20%      12.572us         0.20%      12.572us       2.095us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.46%      29.840us         0.46%      29.840us       3.316us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       5.420us         0.08%       5.420us       1.807us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.46%      29.490us         0.46%      29.490us       9.830us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        72.73%       4.670ms        72.73%       4.670ms       4.670ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.591ms
-Self CUDA time total: 4.774ms
+Self CPU time total: 6.421ms
+Self CUDA time total: 4.846ms
 
 
 
@@ -3987,19 +3987,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         1.89%     121.823us        28.69%       1.852ms       1.852ms       0.000us         0.00%       6.323ms       6.323ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.69%      44.435us        26.61%       1.718ms     572.663us       4.766ms       100.00%       6.323ms       2.108ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.767ms       100.03%       4.767ms       4.767ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.766ms       100.00%       4.766ms       1.589ms             3  
-                                Activity Buffer Request        24.91%       1.608ms        24.91%       1.608ms       1.608ms       1.557ms        32.67%       1.557ms       1.557ms             1  
-                                             aten::view         0.19%      12.441us         0.19%      12.441us       2.074us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.50%      32.030us         0.50%      32.030us       3.559us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       4.850us         0.08%       4.850us       1.617us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.44%      28.190us         0.44%      28.190us       9.397us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        71.31%       4.604ms        71.31%       4.604ms       4.604ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         1.96%     126.465us        27.43%       1.766ms       1.766ms       0.000us         0.00%       6.435ms       6.435ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.73%      46.779us        25.26%       1.627ms     542.360us       4.838ms       100.00%       6.435ms       2.145ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.839ms       100.03%       4.839ms       4.839ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.838ms       100.00%       4.838ms       1.613ms             3  
+                                Activity Buffer Request        23.54%       1.516ms        23.54%       1.516ms       1.516ms       1.597ms        33.01%       1.597ms       1.597ms             1  
+                                             aten::view         0.20%      12.929us         0.20%      12.929us       2.155us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.46%      29.911us         0.46%      29.911us       3.323us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       5.300us         0.08%       5.300us       1.767us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.45%      29.003us         0.45%      29.003us       9.668us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        72.57%       4.674ms        72.57%       4.674ms       4.674ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.457ms
-Self CUDA time total: 4.766ms
+Self CPU time total: 6.440ms
+Self CUDA time total: 4.838ms
 
 
 
@@ -4009,37 +4009,38 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         1.32%     150.697us        17.31%       1.975ms       1.975ms       0.000us         0.00%      12.822ms      12.822ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.42%      47.993us        15.87%       1.810ms     603.497us       9.629ms       100.00%      12.822ms       4.274ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.631ms       100.01%       9.631ms       9.631ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       9.629ms       100.00%       9.629ms       3.210ms             3  
-                                Activity Buffer Request        12.56%       1.433ms        12.56%       1.433ms       1.433ms       3.193ms        33.16%       3.193ms       3.193ms             1  
-                                             aten::view         0.12%      13.330us         0.12%      13.330us       2.222us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.28%      32.431us         0.28%      32.431us       3.603us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.05%       5.260us         0.05%       5.260us       1.753us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.56%     291.579us         2.56%     291.579us      97.193us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        82.69%       9.436ms        82.69%       9.436ms       9.436ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         1.17%     134.085us        17.09%       1.957ms       1.957ms       0.000us         0.00%      12.886ms      12.886ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.41%      46.869us        15.80%       1.809ms     603.015us       9.665ms       100.00%      12.886ms       4.295ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.667ms       100.01%       9.667ms       9.667ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       9.665ms       100.00%       9.665ms       3.222ms             3  
+                                Activity Buffer Request        12.76%       1.462ms        12.76%       1.462ms       1.462ms       3.220ms        33.32%       3.220ms       3.220ms             1  
+                                             aten::view         0.12%      13.968us         0.12%      13.968us       2.328us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.26%      30.043us         0.26%      30.043us       3.338us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.05%       5.590us         0.05%       5.590us       1.863us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.31%     264.797us         2.31%     264.797us      88.266us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        82.91%       9.495ms        82.91%       9.495ms       9.495ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 11.410ms
-Self CUDA time total: 9.629ms
+Self CPU time total: 11.452ms
+Self CUDA time total: 9.665ms
 
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_layer_norm    LN_B16_S2048_D4096     0.84  True
 hf_kernels_layer_norm    LN_B16_S2048_D8192     1.65  True
 hf_kernels_layer_norm    LN_B16_S4096_D4096     1.65  True
-hf_kernels_layer_norm    LN_B16_S4096_D8192     3.27  True
+hf_kernels_layer_norm    LN_B16_S4096_D8192     3.26  True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
+Downloading hf-xet (3.2MiB)
+ Downloading hf-xet
 Installed 15 packages in 13ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
-Fetching 4 files:  25%|██▌       | 1/4 [00:00&lt;00:00,  8.47it/s]
-Fetching 4 files:  50%|█████     | 2/4 [00:02&lt;00:02,  1.44s/it]
-Fetching 4 files: 100%|██████████| 4/4 [00:02&lt;00:00,  1.61it/s]</div>
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.22it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.44it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html
index 72ce43dc70edcb0cbcced09b58a31530fadba3d8..f5dd45a5ed15040ec9f80c48eca459fb67a1bc56 100644
--- a/layer_norm/impls/torch_layer_norm.html
+++ b/layer_norm/impls/torch_layer_norm.html
@@ -3862,8 +3862,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     </div>
     
     <div class="main-content">
-        <h2>on_github: huggingface/kernels-uvnotes</h2>
-<h1>Torch LayerNorm Implementation</h1>
+        <h1>Torch LayerNorm Implementation</h1>
 <h2>GPU Info</h2>
 <div class="cell" id="cell-nv">
 <div class="cell-header">
@@ -3872,10 +3871,11 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.22s
+Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">
@@ -3887,7 +3887,7 @@ Cell: nv | 0.22s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:35 2025       
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:26 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.22s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   31C    P0            141W /  350W |       0MiB /  46068MiB |     21%      Default |
+| N/A   30C    P0            108W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,10 +3920,11 @@ Cell: nv | 0.22s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 7.39s
+Cell: benchmark | 7.36s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="26">
 <div class="code-wrap">
@@ -3967,19 +3968,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.94%     153.126us        46.06%       1.791ms       1.791ms       0.000us         0.00%       3.027ms       3.027ms             1  
-                                       aten::layer_norm         0.44%      17.151us        42.12%       1.638ms     545.972us       0.000us         0.00%       3.027ms       1.009ms             3  
-                                aten::native_layer_norm         1.99%      77.265us        41.68%       1.621ms     540.255us       2.317ms       100.00%       3.027ms       1.009ms             3  
+                                       torch_layer_norm         3.90%     151.572us        46.01%       1.786ms       1.786ms       0.000us         0.00%       3.026ms       3.026ms             1  
+                                       aten::layer_norm         0.43%      16.762us        42.11%       1.635ms     544.851us       0.000us         0.00%       3.026ms       1.009ms             3  
+                                aten::native_layer_norm         2.06%      80.009us        41.67%       1.618ms     539.263us       2.316ms       100.00%       3.026ms       1.009ms             3  
                                        torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.318ms       100.06%       2.318ms       2.318ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.317ms       100.00%       2.317ms     772.230us             3  
-                                Activity Buffer Request        37.14%       1.444ms        37.14%       1.444ms       1.444ms     709.980us        30.65%     709.980us     709.980us             1  
-                                            aten::empty         1.21%      46.960us         1.21%      46.960us       5.218us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.16%      45.271us         1.16%      45.271us      15.090us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.18%       7.130us         0.18%       7.130us       1.188us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.94%       2.098ms        53.94%       2.098ms       2.098ms       0.000us         0.00%       0.000us       0.000us             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.316ms       100.00%       2.316ms     772.127us             3  
+                                Activity Buffer Request        37.08%       1.440ms        37.08%       1.440ms       1.440ms     709.855us        30.65%     709.855us     709.855us             1  
+                                            aten::empty         1.19%      46.261us         1.19%      46.261us       5.140us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.16%      45.163us         1.16%      45.163us      15.054us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.17%       6.761us         0.17%       6.761us       1.127us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        53.99%       2.096ms        53.99%       2.096ms       2.096ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.889ms
-Self CUDA time total: 2.317ms
+Self CPU time total: 3.882ms
+Self CUDA time total: 2.316ms
 
 
 
@@ -3989,19 +3990,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.11%      71.092us        25.40%       1.622ms       1.622ms       0.000us         0.00%       6.494ms       6.494ms             1  
-                                       aten::layer_norm         0.16%      10.119us        24.29%       1.551ms     517.038us       0.000us         0.00%       6.494ms       2.165ms             3  
-                                aten::native_layer_norm         0.82%      52.103us        24.13%       1.541ms     513.665us       4.898ms       100.00%       6.494ms       2.165ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.899ms       100.03%       4.899ms       4.899ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.898ms       100.00%       4.898ms       1.633ms             3  
-                                Activity Buffer Request        22.36%       1.428ms        22.36%       1.428ms       1.428ms       1.596ms        32.59%       1.596ms       1.596ms             1  
-                                            aten::empty         0.49%      31.052us         0.49%      31.052us       3.450us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.41%      26.160us         0.41%      26.160us       8.720us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       3.830us         0.06%       3.830us       0.638us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        74.60%       4.764ms        74.60%       4.764ms       4.764ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.19%      75.581us        25.55%       1.628ms       1.628ms       0.000us         0.00%       6.473ms       6.473ms             1  
+                                       aten::layer_norm         0.14%       9.142us        24.37%       1.553ms     517.550us       0.000us         0.00%       6.473ms       2.158ms             3  
+                                aten::native_layer_norm         0.81%      51.921us        24.22%       1.544ms     514.502us       4.881ms       100.00%       6.473ms       2.158ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.882ms       100.03%       4.882ms       4.882ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.881ms       100.00%       4.881ms       1.627ms             3  
+                                Activity Buffer Request        22.46%       1.431ms        22.46%       1.431ms       1.431ms       1.592ms        32.61%       1.592ms       1.592ms             1  
+                                            aten::empty         0.44%      27.841us         0.44%      27.841us       3.093us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.45%      28.910us         0.45%      28.910us       9.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.06%       3.829us         0.06%       3.829us       0.638us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        74.45%       4.743ms        74.45%       4.743ms       4.743ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.386ms
-Self CUDA time total: 4.898ms
+Self CPU time total: 6.372ms
+Self CUDA time total: 4.881ms
 
 
 
@@ -4011,19 +4012,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.17%      72.893us        26.00%       1.616ms       1.616ms       0.000us         0.00%       6.248ms       6.248ms             1  
-                                       aten::layer_norm         0.15%       9.290us        24.82%       1.543ms     514.468us       0.000us         0.00%       6.248ms       2.083ms             3  
-                                aten::native_layer_norm         0.84%      52.403us        24.67%       1.534ms     511.371us       4.735ms       100.00%       6.248ms       2.083ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.736ms       100.03%       4.736ms       4.736ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.735ms       100.00%       4.735ms       1.578ms             3  
-                                Activity Buffer Request        22.86%       1.421ms        22.86%       1.421ms       1.421ms       1.513ms        31.96%       1.513ms       1.513ms             1  
-                                            aten::empty         0.47%      29.320us         0.47%      29.320us       3.258us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.43%      26.781us         0.43%      26.781us       8.927us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.07%       4.140us         0.07%       4.140us       0.690us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        74.00%       4.601ms        74.00%       4.601ms       4.601ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.15%      71.882us        26.71%       1.668ms       1.668ms       0.000us         0.00%       6.222ms       6.222ms             1  
+                                       aten::layer_norm         0.15%       9.629us        25.56%       1.596ms     532.153us       0.000us         0.00%       6.222ms       2.074ms             3  
+                                aten::native_layer_norm         0.90%      56.373us        25.41%       1.587ms     528.943us       4.717ms       100.00%       6.222ms       2.074ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.718ms       100.03%       4.718ms       4.718ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.717ms       100.00%       4.717ms       1.572ms             3  
+                                Activity Buffer Request        23.44%       1.464ms        23.44%       1.464ms       1.464ms       1.506ms        31.93%       1.506ms       1.506ms             1  
+                                            aten::empty         0.46%      28.850us         0.46%      28.850us       3.206us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.52%      32.781us         0.52%      32.781us      10.927us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.07%       4.590us         0.07%       4.590us       0.765us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        73.29%       4.577ms        73.29%       4.577ms       4.577ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.218ms
-Self CUDA time total: 4.735ms
+Self CPU time total: 6.246ms
+Self CUDA time total: 4.717ms
 
 
 
@@ -4033,19 +4034,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.66%      74.633us        14.54%       1.650ms       1.650ms       0.000us         0.00%      13.090ms      13.090ms             1  
-                                       aten::layer_norm         0.09%       9.800us        13.88%       1.575ms     525.028us       0.000us         0.00%      13.090ms       4.363ms             3  
-                                aten::native_layer_norm         0.45%      51.390us        13.79%       1.565ms     521.762us       9.838ms       100.00%      13.090ms       4.363ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.839ms       100.01%       9.839ms       9.839ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.838ms       100.00%       9.838ms       3.279ms             3  
-                                Activity Buffer Request        11.36%       1.289ms        11.36%       1.289ms       1.289ms       3.253ms        33.06%       3.253ms       3.253ms             1  
-                                            aten::empty         0.28%      31.381us         0.28%      31.381us       3.487us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.67%     189.088us         1.67%     189.088us      63.029us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.04%       4.121us         0.04%       4.121us       0.687us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        85.46%       9.697ms        85.46%       9.697ms       9.697ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.67%      74.340us        13.35%       1.490ms       1.490ms       0.000us         0.00%      13.028ms      13.028ms             1  
+                                       aten::layer_norm         0.09%       9.510us        12.69%       1.416ms     471.835us       0.000us         0.00%      13.028ms       4.343ms             3  
+                                aten::native_layer_norm         0.47%      52.269us        12.60%       1.406ms     468.665us       9.808ms       100.00%      13.028ms       4.343ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.809ms       100.02%       9.809ms       9.809ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.808ms       100.00%       9.808ms       3.269ms             3  
+                                Activity Buffer Request         9.72%       1.085ms         9.72%       1.085ms       1.085ms       3.220ms        32.83%       3.220ms       3.220ms             1  
+                                            aten::empty         0.26%      29.181us         0.26%      29.181us       3.242us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         2.11%     235.817us         2.11%     235.817us      78.606us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.04%       4.022us         0.04%       4.022us       0.670us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        86.65%       9.669ms        86.65%       9.669ms       9.669ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 11.347ms
-Self CUDA time total: 9.838ms
+Self CPU time total: 11.159ms
+Self CUDA time total: 9.808ms
 
 
 impl                     wl                  p50(ms)  ok
@@ -4057,7 +4058,7 @@ torch_layer_norm         LN_B16_S4096_D8192     3.33  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 221ms
+Installed 37 packages in 222ms
 </div>
 </div>
 <div class="cell-artifacts">
diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg
index 51fba97fb0809dfd942d52b9b34e8a096d515676..c17ece602ed5ebc325bf99b71237b08ca31fbe89 100644
--- a/layer_norm/results/artifacts/combine/latency.svg
+++ b/layer_norm/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1e41c135df9f0b506fa1ac950b90bd609d850f01d79b3171b3678c24fdab066a
-size 14645
+oid sha256:8fd53794c4617f7e947676c655de6f739b720b8f16a59432369c127bfc08190a
+size 14644
diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html
index 616fba09e8126d17fe18ed8e4396c65eb84adaef..5a42e66a6787e88853b7090c03ba6d4a8cd04457 100644
--- a/layer_norm/results/combined_results.html
+++ b/layer_norm/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:21.825978</dc:date>
+    <dc:date>2025-10-29T14:27:45.722521</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -3956,70 +3956,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 409.029804  L 840.20233 409.029804  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 408.957392  L 840.20233 408.957392  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_5">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="409.029804" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="408.957392" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_5">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.829023" transform="rotate(-0 40.72 412.829023)">1.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.756611" transform="rotate(-0 40.72 412.756611)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 331.290271  L 840.20233 331.290271  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 331.05018  L 840.20233 331.05018  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_6">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="331.290271" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="331.05018" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_6">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.08949" transform="rotate(-0 40.72 335.08949)">1.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.849399" transform="rotate(-0 40.72 334.849399)">1.5</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 253.550738  L 840.20233 253.550738  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 253.142969  L 840.20233 253.142969  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="253.550738" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="253.142969" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.349957" transform="rotate(-0 40.72 257.349957)">2.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.942188" transform="rotate(-0 40.72 256.942188)">2.0</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 175.811205  L 840.20233 175.811205  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 175.235758  L 840.20233 175.235758  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="175.811205" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="175.235758" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.610424" transform="rotate(-0 40.72 179.610424)">2.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.034976" transform="rotate(-0 40.72 179.034976)">2.5</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 98.071672  L 840.20233 98.071672  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 97.328546  L 840.20233 97.328546  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="98.071672" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="97.328546" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.870891" transform="rotate(-0 40.72 101.870891)">3.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.127765" transform="rotate(-0 40.72 101.127765)">3.0</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4027,27 +4027,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     </g>
    </g>
    <g id="series--torch-layer-norm" class="series">
-    <path d="M 83.741924 437.689571  L 323.888085 303.094453  L 564.034245 314.534914  L 804.180406 46.442361  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.741924 437.689571  L 323.888085 302.833591  L 564.034245 313.993176  L 804.180406 46.442361  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p2214f54723)">
      <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="323.888085" y="303.094453" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="564.034245" y="314.534914" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="323.888085" y="302.833591" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="564.034245" y="313.993176" style="fill: #1f77b4; stroke: #1f77b4" />
      <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--hf-kernels-layer-norm" class="series">
-    <path d="M 83.741924 434.514533  L 323.888085 307.713737  L 564.034245 307.460461  L 804.180406 56.028111  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.741924 434.434608  L 323.888085 307.690482  L 564.034245 307.408302  L 804.180406 57.182805  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p2214f54723)">
-     <use ns4:href="#m9b8c54d372" x="83.741924" y="434.514533" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="323.888085" y="307.713737" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="564.034245" y="307.460461" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="804.180406" y="56.028111" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.741924" y="434.434608" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="323.888085" y="307.690482" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="564.034245" y="307.408302" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="804.180406" y="57.182805" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
@@ -4105,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 4.25s
+Cell: combine | 4.21s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4195,7 +4195,7 @@ impl                     wl                  p50(ms)  ok
 hf_kernels_layer_norm    LN_B16_S2048_D4096     0.84  True
 hf_kernels_layer_norm    LN_B16_S2048_D8192     1.65  True
 hf_kernels_layer_norm    LN_B16_S4096_D4096     1.65  True
-hf_kernels_layer_norm    LN_B16_S4096_D8192     3.27  True
+hf_kernels_layer_norm    LN_B16_S4096_D8192     3.26  True
 torch_layer_norm         LN_B16_S2048_D4096     0.82  True
 torch_layer_norm         LN_B16_S2048_D8192     1.68  True
 torch_layer_norm         LN_B16_S4096_D4096     1.61  True
@@ -4219,7 +4219,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 219ms
+Installed 37 packages in 210ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4232,7 +4232,7 @@ Installed 37 packages in 219ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:21.825978</dc:date>
+    <dc:date>2025-10-29T14:27:45.722521</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4316,70 +4316,70 @@ Installed 37 packages in 219ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 409.029804  L 840.20233 409.029804  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 408.957392  L 840.20233 408.957392  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_5">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="409.029804" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="408.957392" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_5">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.829023" transform="rotate(-0 40.72 412.829023)">1.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.756611" transform="rotate(-0 40.72 412.756611)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 331.290271  L 840.20233 331.290271  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 331.05018  L 840.20233 331.05018  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_6">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="331.290271" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="331.05018" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_6">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.08949" transform="rotate(-0 40.72 335.08949)">1.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.849399" transform="rotate(-0 40.72 334.849399)">1.5</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 253.550738  L 840.20233 253.550738  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 253.142969  L 840.20233 253.142969  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="253.550738" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="253.142969" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.349957" transform="rotate(-0 40.72 257.349957)">2.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.942188" transform="rotate(-0 40.72 256.942188)">2.0</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 175.811205  L 840.20233 175.811205  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 175.235758  L 840.20233 175.235758  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="175.811205" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="175.235758" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.610424" transform="rotate(-0 40.72 179.610424)">2.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.034976" transform="rotate(-0 40.72 179.034976)">2.5</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 98.071672  L 840.20233 98.071672  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 97.328546  L 840.20233 97.328546  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="98.071672" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="97.328546" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.870891" transform="rotate(-0 40.72 101.870891)">3.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.127765" transform="rotate(-0 40.72 101.127765)">3.0</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4387,27 +4387,27 @@ Installed 37 packages in 219ms
     </g>
    </g>
    <g id="series--torch-layer-norm" class="series">
-    <path d="M 83.741924 437.689571  L 323.888085 303.094453  L 564.034245 314.534914  L 804.180406 46.442361  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.741924 437.689571  L 323.888085 302.833591  L 564.034245 313.993176  L 804.180406 46.442361  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p2214f54723)">
      <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="323.888085" y="303.094453" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="564.034245" y="314.534914" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="323.888085" y="302.833591" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="564.034245" y="313.993176" style="fill: #1f77b4; stroke: #1f77b4" />
      <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--hf-kernels-layer-norm" class="series">
-    <path d="M 83.741924 434.514533  L 323.888085 307.713737  L 564.034245 307.460461  L 804.180406 56.028111  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.741924 434.434608  L 323.888085 307.690482  L 564.034245 307.408302  L 804.180406 57.182805  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p2214f54723)">
-     <use ns4:href="#m9b8c54d372" x="83.741924" y="434.514533" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="323.888085" y="307.713737" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="564.034245" y="307.460461" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="804.180406" y="56.028111" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.741924" y="434.434608" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="323.888085" y="307.690482" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="564.034245" y="307.408302" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="804.180406" y="57.182805" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl
index 18ed4f37499b08e63b86a43f9ee0bdc193375b0d..e407db0807eb78b1db05edcb765f594b555812aa 100644
--- a/rotary/impls/artifacts/benchmark/rotary.jsonl
+++ b/rotary/impls/artifacts/benchmark/rotary.jsonl
@@ -1,24 +1,24 @@
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1724160000549091, "p50": 0.17308600001797458, "p90": 0.1756759999125279, "mean": 0.1760500000045795, "iqr": 0.0032199998258874984, "raw_times": [0.17245600008664042, 0.1756759999125279, 0.1724160000549091, 0.17308600001797458, 0.18661599995084543], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.17975699995531613, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22642799990535423, "p50": 0.2294280000114668, "p90": 0.23093799995876907, "mean": 0.23135619996992318, "iqr": 0.0026599999500831473, "raw_times": [0.23093799995876907, 0.22642799990535423, 0.22827800000868592, 0.2417089999653399, 0.2294280000114668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23494799995660287, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21688800006813835, "p50": 0.21992799997860857, "p90": 0.2219079999576934, "mean": 0.22172000001319248, "iqr": 0.004439999884198187, "raw_times": [0.2174680000734952, 0.2219079999576934, 0.21688800006813835, 0.23240799998802686, 0.21992799997860857], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.225418000013633, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21487700007583044, "p50": 0.21964699999443837, "p90": 0.22132800006602338, "mean": 0.21978760003094067, "iqr": 0.005100000066704524, "raw_times": [0.21487700007583044, 0.21622799999931885, 0.21964699999443837, 0.22132800006602338, 0.2268580000190923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24882799993974913, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21332699998311, "p50": 0.21615699995436444, "p90": 0.21744800005762954, "mean": 0.21590960000139603, "iqr": 0.0025000000505315256, "raw_times": [0.21332699998311, 0.21744800005762954, 0.21494800000709802, 0.21766800000477815, 0.21615699995436444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22001800005000405, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21038799991401902, "p50": 0.21561700009442575, "p90": 0.21720800009461527, "mean": 0.22098599999935686, "iqr": 0.004100000182916119, "raw_times": [0.21038799991401902, 0.21720800009461527, 0.21561700009442575, 0.24860899998202513, 0.21310799991169915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2178580000418151, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21404700009952649, "p50": 0.21557699994900759, "p90": 0.2158679999411106, "mean": 0.2152116000161186, "iqr": 0.0011999999287581886, "raw_times": [0.2158679999411106, 0.21589800007859594, 0.21404700009952649, 0.21466800001235242, 0.21557699994900759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21567799990407366, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21701799994389148, "p50": 0.21822700000484474, "p90": 0.22002800005793688, "mean": 0.2237478000097326, "iqr": 0.002031000008173578, "raw_times": [0.22002800005793688, 0.2179970000497633, 0.2454689999922266, 0.21822700000484474, 0.21701799994389148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22291799996310147, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21712800003115262, "p50": 0.21885700004986575, "p90": 0.2196080000658185, "mean": 0.22401780001928273, "iqr": 0.001630000042496249, "raw_times": [0.21797800002332224, 0.2196080000658185, 0.24651799992625456, 0.21885700004986575, 0.21712800003115262], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2206780000051367, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21462800009430794, "p50": 0.21782799990432977, "p90": 0.21795700001803198, "mean": 0.21911359999648994, "iqr": 0.0030300000162242213, "raw_times": [0.21462800009430794, 0.23022799996397225, 0.21782799990432977, 0.21492700000180776, 0.21795700001803198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2186980000260519, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21579799999926763, "p50": 0.21701699995446688, "p90": 0.22130799993647088, "mean": 0.2237457999626713, "iqr": 0.004450000005817856, "raw_times": [0.21701699995446688, 0.22130799993647088, 0.21579799999926763, 0.24774799999249808, 0.21685799993065302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22235700009787251, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22434800007431477, "p50": 0.2248280000003433, "p90": 0.22490799995011912, "mean": 0.22479799997654482, "iqr": 0.00031000001854408765, "raw_times": [0.2248280000003433, 0.22490799995011912, 0.22459799993157503, 0.22434800007431477, 0.22530799992637185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23522799995134847, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21574699997017888, "p50": 0.21802799994929956, "p90": 0.21904799996264046, "mean": 0.22033179998288688, "iqr": 0.0018999999156221747, "raw_times": [0.21714800004701829, 0.21802799994929956, 0.2316879999852972, 0.21904799996264046, 0.21574699997017888], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22168800001054478, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21753800001533818, "p50": 0.21888800006308884, "p90": 0.22129700005280029, "mean": 0.22190180004599824, "iqr": 0.003358999947522534, "raw_times": [0.21753800001533818, 0.23384799999348616, 0.21793800010527775, 0.21888800006308884, 0.22129700005280029], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22266799999215436, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2166670000178783, "p50": 0.21850699999959033, "p90": 0.21964699999443837, "mean": 0.21864339998955984, "iqr": 0.001419000000169035, "raw_times": [0.21850699999959033, 0.2166670000178783, 0.22016799994162284, 0.21822799999426934, 0.21964699999443837], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23552799996195972, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21624800001518452, "p50": 0.21773700007088337, "p90": 0.21802799994929956, "mean": 0.21774760000425886, "iqr": 0.0013409999155555852, "raw_times": [0.21668700003374397, 0.21773700007088337, 0.22003799995218287, 0.21624800001518452, 0.21802799994929956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226780000000872, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21748699998624943, "p50": 0.22014700005001941, "p90": 0.22206799997093185, "mean": 0.22232159999475698, "iqr": 0.0019999999949504854, "raw_times": [0.22014700005001941, 0.23183799999060284, 0.22206799997093185, 0.21748699998624943, 0.22006799997598137], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22040800001832395, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21601800006010308, "p50": 0.21957800004202, "p90": 0.22023799999715266, "mean": 0.2213318000030995, "iqr": 0.0024510000002919696, "raw_times": [0.23303799991936103, 0.21601800006010308, 0.2177869999968607, 0.21957800004202, 0.22023799999715266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.220787999978711, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21692799998618284, "p50": 0.22003699996275827, "p90": 0.2230679999684071, "mean": 0.222287800011145, "iqr": 0.0031599998919773498, "raw_times": [0.21692799998618284, 0.21990800007642974, 0.2314980000619471, 0.2230679999684071, 0.22003699996275827], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22102700006598752, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160679999860804, "p50": 0.21972700005790102, "p90": 0.22029800004474964, "mean": 0.21970960001453932, "iqr": 0.0024610000082248007, "raw_times": [0.2160679999860804, 0.2246179999474407, 0.22029800004474964, 0.21972700005790102, 0.21783700003652484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22191799996562622, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172279999967941, "p50": 0.21847799996521644, "p90": 0.22105800007921061, "mean": 0.22193580000475777, "iqr": 0.0035110000453641987, "raw_times": [0.21847799996521644, 0.22105800007921061, 0.23536799994872126, 0.21754700003384642, 0.2172279999967941], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22206799997093185, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21436800000174117, "p50": 0.21785799992812827, "p90": 0.2195579999124675, "mean": 0.2202379999744153, "iqr": 0.0030299999025373836, "raw_times": [0.21436800000174117, 0.21785799992812827, 0.2195579999124675, 0.2165280000099301, 0.2328780000198094], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25353900002755836, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22968799999034673, "p50": 0.23015800002212927, "p90": 0.23064800006977748, "mean": 0.23369620002995362, "iqr": 0.0006600000688194996, "raw_times": [0.23015800002212927, 0.24799900006655662, 0.22968799999034673, 0.22998800000095798, 0.23064800006977748], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042800000894204, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347319999804313, "p50": 0.6375930000785957, "p90": 0.639283000055002, "mean": 0.6376124000325945, "iqr": 0.003270999968663091, "raw_times": [0.6375930000785957, 0.636012000086339, 0.6404419999626043, 0.6347319999804313, 0.639283000055002], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.639422999938688, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17006399997399058, "p50": 0.17533400000502297, "p90": 0.1853339999797754, "mean": 0.1802961999942454, "iqr": 0.014799999974002276, "raw_times": [0.17533400000502297, 0.20021500000666492, 0.17053400000577312, 0.1853339999797754, 0.17006399997399058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18331500001522727, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2246159999685915, "p50": 0.2266160000203854, "p90": 0.22888500001272405, "mean": 0.22735560000910482, "iqr": 0.002880000010918593, "raw_times": [0.22600500000180546, 0.22888500001272405, 0.2246159999685915, 0.2266160000203854, 0.2306560000420177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2471160000254713, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21813499995460006, "p50": 0.22189599997091136, "p90": 0.2272149999953399, "mean": 0.22315939997952228, "iqr": 0.007960000004914036, "raw_times": [0.2272149999953399, 0.22189599997091136, 0.21925499999042586, 0.21813499995460006, 0.2292959999863342], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2391049999914685, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21478600001501036, "p50": 0.21544499998071842, "p90": 0.2178249999928994, "mean": 0.2161891999890031, "iqr": 0.0027799999884337012, "raw_times": [0.21544499998071842, 0.2178249999928994, 0.21478600001501036, 0.21784499995192164, 0.2150450000044657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22814599998355334, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160950000416051, "p50": 0.22390499998437008, "p90": 0.22473600000694205, "mean": 0.22559540000202105, "iqr": 0.00507100003233063, "raw_times": [0.22390499998437008, 0.24357600000257662, 0.22473600000694205, 0.2160950000416051, 0.21966499997461142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.229085999990275, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21366499998975996, "p50": 0.21597500000325454, "p90": 0.21670500001391702, "mean": 0.2158129999884295, "iqr": 0.0008600000569458643, "raw_times": [0.21366499998975996, 0.2168749999782449, 0.21597500000325454, 0.21670500001391702, 0.21584499995697115], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21572499997546402, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2142449999951168, "p50": 0.21574499999132968, "p90": 0.2169850000086626, "mean": 0.21585539999477987, "iqr": 0.0022990000161371427, "raw_times": [0.2142449999951168, 0.2176159999862648, 0.21468599999252547, 0.2169850000086626, 0.21574499999132968], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2192349999745602, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21675499999673775, "p50": 0.21711599998752718, "p90": 0.21833499999956985, "mean": 0.2174776000060774, "iqr": 0.0015789999565640755, "raw_times": [0.21675499999673775, 0.21711599998752718, 0.21833499999956985, 0.21675600004300577, 0.2184260000035465], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22064600000248902, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2153649999740992, "p50": 0.21702599997297511, "p90": 0.21829499996783852, "mean": 0.21729759998834197, "iqr": 0.0014989999499448459, "raw_times": [0.2153649999740992, 0.21679600001789368, 0.21900600000890336, 0.21702599997297511, 0.21829499996783852], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22154499998805477, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143060000321384, "p50": 0.2161449999675824, "p90": 0.21640500000330576, "mean": 0.21578740000904872, "iqr": 0.0008589999538344273, "raw_times": [0.21653499999274572, 0.21640500000330576, 0.2143060000321384, 0.2161449999675824, 0.21554600004947133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23455599995259035, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21488499999122723, "p50": 0.21633500000461936, "p90": 0.21918499999173946, "mean": 0.21730919999072285, "iqr": 0.004300000000512227, "raw_times": [0.21488499999122723, 0.21918499999173946, 0.21488499999122723, 0.22125599997480094, 0.21633500000461936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2185359999771208, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2230359999657594, "p50": 0.22526600002947816, "p90": 0.22695600000588456, "mean": 0.22723160000168718, "iqr": 0.0026509999884183344, "raw_times": [0.22526600002947816, 0.2230359999657594, 0.23659499998984757, 0.22430500001746623, 0.22695600000588456], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22456599998577076, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21608499997682884, "p50": 0.2175149999743553, "p90": 0.22948600002337116, "mean": 0.2247094000040306, "iqr": 0.012610999988282856, "raw_times": [0.21608499997682884, 0.2168750000350883, 0.22948600002337116, 0.24358600001050945, 0.2175149999743553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21851499997183055, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2162149999662688, "p50": 0.21694499997693129, "p90": 0.2171250000060354, "mean": 0.21706100000074002, "iqr": 0.0003099999617006688, "raw_times": [0.2162149999662688, 0.21694499997693129, 0.2171250000060354, 0.21820500001012988, 0.21681500004433474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21809500003655558, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2136749999976928, "p50": 0.21658500003240988, "p90": 0.21662599999672238, "mean": 0.21621120000645533, "iqr": 0.00066100000140068, "raw_times": [0.2136749999976928, 0.2159649999953217, 0.21658500003240988, 0.21662599999672238, 0.21820500001012988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2181750000431748, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21695499998486412, "p50": 0.21774499998628016, "p90": 0.2285450000272249, "mean": 0.22256720000086716, "iqr": 0.010920000022451859, "raw_times": [0.21774499998628016, 0.21762500000477303, 0.2319660000011936, 0.21695499998486412, 0.2285450000272249], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22269599998026024, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21440599999777987, "p50": 0.21785499995985447, "p90": 0.2335159999802272, "mean": 0.2228595999895333, "iqr": 0.01891099998374557, "raw_times": [0.21440599999777987, 0.23391600001332336, 0.21785499995985447, 0.2335159999802272, 0.21460499999648164], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21932499998911226, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133250000042608, "p50": 0.21422499997925115, "p90": 0.21653499999274572, "mean": 0.21708740000576654, "iqr": 0.0029589999712698045, "raw_times": [0.21357600002147592, 0.2133250000042608, 0.21653499999274572, 0.21422499997925115, 0.2277760000310991], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22739600001386862, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21597500000325454, "p50": 0.2176859999849512, "p90": 0.21771499996248167, "mean": 0.21758339998996234, "iqr": 0.0013999999737279722, "raw_times": [0.2176859999849512, 0.21771499996248167, 0.22022600001037063, 0.2163149999887537, 0.21597500000325454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21613599994907418, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21577600000455277, "p50": 0.2173749999769825, "p90": 0.21900600000890336, "mean": 0.21836960000882755, "iqr": 0.0018509999790694565, "raw_times": [0.2171550000298339, 0.21577600000455277, 0.2225360000238652, 0.2173749999769825, 0.21900600000890336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22321599999486352, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21538499998996485, "p50": 0.21647599999141676, "p90": 0.21717500004569956, "mean": 0.2167214000110107, "iqr": 0.001030000021273736, "raw_times": [0.21717500004569956, 0.2184260000035465, 0.21538499998996485, 0.21614500002442583, 0.21647599999141676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21872600001415776, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21501500003751062, "p50": 0.2168760000245129, "p90": 0.2187050000088675, "mean": 0.21949320001795058, "iqr": 0.0030500000320898835, "raw_times": [0.21565499997677762, 0.2187050000088675, 0.21501500003751062, 0.23121500004208428, 0.2168760000245129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22076499999457155, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22937599999295344, "p50": 0.23008499999832566, "p90": 0.23144499999716572, "mean": 0.23359140000138723, "iqr": 0.0020100000028833165, "raw_times": [0.23144499999716572, 0.23008499999832566, 0.2294349999942824, 0.22937599999295344, 0.24761600002420892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23195599999326078, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.637245999996594, "p50": 0.6388759999822469, "p90": 0.6389449999915087, "mean": 0.6396317999929124, "iqr": 0.0012190000120426703, "raw_times": [0.6388759999822469, 0.6453660000147465, 0.6389449999915087, 0.637245999996594, 0.637725999979466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6364359999793123, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html
index 2749f9f6b5f352621fbf7d1a4c5db169ca775615..0608b9088d0d84399b39661fd8d9fc01a39dbda5 100644
--- a/rotary/impls/hf_kernels_rotary.html
+++ b/rotary/impls/hf_kernels_rotary.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.23s
+Cell: nv | 0.20s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:24 2025       
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:51 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   29C    P0             90W /  350W |       0MiB /  46068MiB |     24%      Default |
+| N/A   32C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 8.05s
+Cell: benchmark | 7.90s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3989,23 +3989,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     452.802us      1907.02%     452.802us     452.802us             1  
-                                      hf_kernels_rotary        12.50%     264.332us        99.65%       2.107ms       2.107ms       0.000us         0.00%      24.960us      24.960us             1  
-                          _rotary_dba7d1e::apply_rotary         2.70%      57.162us         4.91%     103.733us      17.289us      16.928us        71.29%      16.928us       2.821us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us        71.29%      16.928us       2.821us             6  
-                                            aten::clone         2.21%      46.761us        79.27%       1.676ms     279.401us       0.000us         0.00%       8.032us       1.339us             6  
-                                            aten::copy_         2.31%      48.833us        74.02%       1.565ms     260.899us       6.816us        28.71%       8.032us       1.339us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        28.71%       6.816us       1.136us             6  
-                                Activity Buffer Request        68.03%       1.439ms        68.03%       1.439ms       1.439ms       1.216us         5.12%       1.216us       1.216us             1  
-                                    aten::empty_strided         3.04%      64.252us         3.04%      64.252us      10.709us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         3.68%      77.892us         3.68%      77.892us      12.982us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.33%      49.309us         2.97%      62.771us       5.231us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.64%      13.462us         0.64%      13.462us       1.122us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.20%      46.571us         2.20%      46.571us       7.762us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.35%       7.480us         0.35%       7.480us       7.480us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     433.056us      1833.74%     433.056us     433.056us             1  
+                                      hf_kernels_rotary        12.39%     257.808us        99.67%       2.073ms       2.073ms       0.000us         0.00%      24.832us      24.832us             1  
+                          _rotary_dba7d1e::apply_rotary         2.75%      57.199us         5.11%     106.332us      17.722us      16.960us        71.82%      16.960us       2.827us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us        71.82%      16.960us       2.827us             6  
+                                            aten::clone         2.11%      43.871us        79.26%       1.649ms     274.763us       0.000us         0.00%       7.872us       1.312us             6  
+                                            aten::copy_         2.19%      45.572us        74.13%       1.542ms     256.978us       6.656us        28.18%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us        28.18%       6.656us       1.109us             6  
+                                Activity Buffer Request        68.36%       1.422ms        68.36%       1.422ms       1.422ms       1.216us         5.15%       1.216us       1.216us             1  
+                                    aten::empty_strided         3.02%      62.841us         3.02%      62.841us      10.473us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.58%      74.452us         3.58%      74.452us      12.409us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.28%      47.469us         2.90%      60.410us       5.034us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.62%      12.941us         0.62%      12.941us       1.078us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.36%      49.133us         2.36%      49.133us       8.189us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.33%       6.850us         0.33%       6.850us       6.850us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.115ms
-Self CUDA time total: 23.744us
+Self CPU time total: 2.080ms
+Self CUDA time total: 23.616us
 
 
 
@@ -4015,23 +4015,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     357.532us      1513.94%     357.532us     357.532us             1  
-                                      hf_kernels_rotary         9.61%     183.785us        99.72%       1.907ms       1.907ms       0.000us         0.00%      24.736us      24.736us             1  
-                          _rotary_dba7d1e::apply_rotary         2.38%      45.511us         4.57%      87.364us      14.561us      16.832us        71.27%      16.832us       2.805us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us        71.27%      16.832us       2.805us             6  
-                                            aten::clone         1.27%      24.322us        83.40%       1.595ms     265.794us       0.000us         0.00%       7.904us       1.317us             6  
-                                            aten::copy_         1.98%      37.831us        80.39%       1.537ms     256.202us       6.784us        28.73%       7.904us       1.317us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        28.73%       6.784us       1.131us             6  
-                                Activity Buffer Request        75.51%       1.444ms        75.51%       1.444ms       1.444ms       1.120us         4.74%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.74%      33.230us         1.74%      33.230us       5.538us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.90%      55.533us         2.90%      55.533us       9.256us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.68%      32.211us         2.13%      40.791us       3.399us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.45%       8.580us         0.45%       8.580us       0.715us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.19%      41.853us         2.19%      41.853us       6.976us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.28%       5.420us         0.28%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     368.319us      1559.68%     368.319us     368.319us             1  
+                                      hf_kernels_rotary         8.92%     167.782us        99.73%       1.876ms       1.876ms       0.000us         0.00%      24.767us      24.767us             1  
+                          _rotary_dba7d1e::apply_rotary         2.34%      44.032us         4.50%      84.553us      14.092us      16.832us        71.28%      16.832us       2.805us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us        71.28%      16.832us       2.805us             6  
+                                            aten::clone         1.16%      21.840us        83.94%       1.579ms     263.113us       0.000us         0.00%       7.935us       1.322us             6  
+                                            aten::copy_         2.86%      53.852us        81.07%       1.525ms     254.111us       6.783us        28.72%       7.935us       1.322us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.783us        28.72%       6.783us       1.130us             6  
+                                Activity Buffer Request        75.10%       1.412ms        75.10%       1.412ms       1.412ms       1.152us         4.88%       1.152us       1.152us             1  
+                                    aten::empty_strided         1.71%      32.171us         1.71%      32.171us       5.362us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.11%      58.461us         3.11%      58.461us       9.744us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.82%      34.274us         2.37%      44.512us       3.709us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.54%      10.238us         0.54%      10.238us       0.853us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.15%      40.521us         2.15%      40.521us       6.753us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.140us         0.27%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.912ms
-Self CUDA time total: 23.616us
+Self CPU time total: 1.881ms
+Self CUDA time total: 23.615us
 
 
 
@@ -4041,23 +4041,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     340.570us      1359.24%     340.570us     340.570us             1  
-                                      hf_kernels_rotary         8.83%     169.069us        99.74%       1.910ms       1.910ms       0.000us         0.00%      26.368us      26.368us             1  
-                          _rotary_dba7d1e::apply_rotary         2.33%      44.610us         4.50%      86.120us      14.353us      17.248us        68.84%      17.248us       2.875us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.248us        68.84%      17.248us       2.875us             6  
-                                            aten::clone         1.25%      23.991us        84.27%       1.614ms     269.024us       0.000us         0.00%       9.120us       1.520us             6  
-                                            aten::copy_         1.92%      36.791us        81.38%       1.559ms     259.779us       7.808us        31.16%       9.120us       1.520us             6  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.939us      1384.60%     346.939us     346.939us             1  
+                                      hf_kernels_rotary         8.57%     160.653us        99.71%       1.870ms       1.870ms       0.000us         0.00%      26.369us      26.369us             1  
+                          _rotary_dba7d1e::apply_rotary         2.32%      43.421us         4.67%      87.601us      14.600us      17.249us        68.84%      17.249us       2.875us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.249us        68.84%      17.249us       2.875us             6  
+                                            aten::clone         1.23%      23.032us        84.13%       1.577ms     262.912us       0.000us         0.00%       9.120us       1.520us             6  
+                                            aten::copy_         1.94%      36.311us        81.17%       1.522ms     253.669us       7.808us        31.16%       9.120us       1.520us             6  
                          Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        31.16%       7.808us       1.301us             6  
-                                Activity Buffer Request        76.60%       1.467ms        76.60%       1.467ms       1.467ms       1.312us         5.24%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.64%      31.482us         1.64%      31.482us       5.247us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.85%      54.600us         2.85%      54.600us       9.100us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.69%      32.440us         2.15%      41.092us       3.424us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.45%       8.652us         0.45%       8.652us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.17%      41.510us         2.17%      41.510us       6.918us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.990us         0.26%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        76.42%       1.433ms        76.42%       1.433ms       1.433ms       1.312us         5.24%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.73%      32.420us         1.73%      32.420us       5.403us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.81%      52.730us         2.81%      52.730us       8.788us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.83%      34.233us         2.34%      43.964us       3.664us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.52%       9.731us         0.52%       9.731us       0.811us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.36%      44.180us         2.36%      44.180us       7.363us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       5.410us         0.29%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.915ms
-Self CUDA time total: 25.056us
+Self CPU time total: 1.875ms
+Self CUDA time total: 25.057us
 
 
 
@@ -4067,23 +4067,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.075us      1340.08%     346.075us     346.075us             1  
-                                      hf_kernels_rotary         7.97%     168.270us        99.76%       2.107ms       2.107ms       0.000us         0.00%      27.137us      27.137us             1  
-                          _rotary_dba7d1e::apply_rotary         2.16%      45.651us         4.14%      87.411us      14.569us      18.049us        69.89%      18.049us       3.008us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.049us        69.89%      18.049us       3.008us             6  
-                                            aten::clone         1.15%      24.271us        85.69%       1.810ms     301.630us       0.000us         0.00%       9.088us       1.515us             6  
-                                            aten::copy_         1.78%      37.581us        83.02%       1.753ms     292.225us       7.776us        30.11%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        30.11%       7.776us       1.296us             6  
-                                Activity Buffer Request        68.60%       1.449ms        68.60%       1.449ms       1.449ms       1.312us         5.08%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.52%      32.162us         1.52%      32.162us       5.360us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.64%     267.018us        12.64%     267.018us      44.503us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.55%      32.701us         1.96%      41.360us       3.447us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.41%       8.659us         0.41%       8.659us       0.722us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.98%      41.760us         1.98%      41.760us       6.960us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       5.141us         0.24%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.904us      1355.61%     347.904us     347.904us             1  
+                                      hf_kernels_rotary         7.92%     162.592us        99.76%       2.047ms       2.047ms       0.000us         0.00%      27.009us      27.009us             1  
+                          _rotary_dba7d1e::apply_rotary         2.09%      42.932us         4.15%      85.134us      14.189us      17.951us        69.95%      17.951us       2.992us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.951us        69.95%      17.951us       2.992us             6  
+                                            aten::clone         1.22%      25.009us        85.61%       1.757ms     292.750us       0.000us         0.00%       9.058us       1.510us             6  
+                                            aten::copy_         1.81%      37.091us        82.80%       1.699ms     283.112us       7.713us        30.05%       9.058us       1.510us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.713us        30.05%       7.713us       1.285us             6  
+                                Activity Buffer Request        69.84%       1.433ms        69.84%       1.433ms       1.433ms       1.345us         5.24%       1.345us       1.345us             1  
+                                    aten::empty_strided         1.60%      32.820us         1.60%      32.820us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.14%     228.627us        11.14%     228.627us      38.104us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.59%      32.701us         2.07%      42.551us       3.546us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.850us         0.48%       9.850us       0.821us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.06%      42.202us         2.06%      42.202us       7.034us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.861us         0.24%       4.861us       4.861us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.112ms
-Self CUDA time total: 25.825us
+Self CPU time total: 2.052ms
+Self CUDA time total: 25.664us
 
 
 
@@ -4093,23 +4093,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     383.355us      1524.21%     383.355us     383.355us             1  
-                                      hf_kernels_rotary         8.48%     177.428us        99.77%       2.088ms       2.088ms       0.000us         0.00%      26.495us      26.495us             1  
-                          _rotary_dba7d1e::apply_rotary         3.05%      63.861us         5.13%     107.442us      17.907us      17.215us        68.45%      17.215us       2.869us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.215us        68.45%      17.215us       2.869us             6  
-                                            aten::clone         1.13%      23.688us        84.02%       1.758ms     293.025us       0.000us         0.00%       9.280us       1.547us             6  
-                                            aten::copy_         1.90%      39.711us        81.30%       1.701ms     283.530us       7.936us        31.55%       9.280us       1.547us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        31.55%       7.936us       1.323us             6  
-                                Activity Buffer Request        67.53%       1.413ms        67.53%       1.413ms       1.413ms       1.344us         5.34%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.59%      33.283us         1.59%      33.283us       5.547us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.87%     248.348us        11.87%     248.348us      41.391us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.70%      35.532us         2.14%      44.714us       3.726us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       9.182us         0.44%       9.182us       0.765us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.08%      43.581us         2.08%      43.581us       7.264us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.831us         0.23%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     356.192us      1425.17%     356.192us     356.192us             1  
+                                      hf_kernels_rotary         9.03%     181.778us        99.74%       2.009ms       2.009ms       0.000us         0.00%      26.306us      26.306us             1  
+                          _rotary_dba7d1e::apply_rotary         2.18%      43.970us         4.25%      85.660us      14.277us      17.088us        68.37%      17.088us       2.848us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        68.37%      17.088us       2.848us             6  
+                                            aten::clone         1.16%      23.451us        84.31%       1.698ms     283.035us       0.000us         0.00%       9.218us       1.536us             6  
+                                            aten::copy_         1.79%      36.151us        81.55%       1.643ms     273.753us       7.905us        31.63%       9.218us       1.536us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.905us        31.63%       7.905us       1.318us             6  
+                                Activity Buffer Request        70.14%       1.413ms        70.14%       1.413ms       1.413ms       1.313us         5.25%       1.313us       1.313us             1  
+                                    aten::empty_strided         1.60%      32.242us         1.60%      32.242us       5.374us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.61%     193.593us         9.61%     193.593us      32.266us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.67%      33.621us         2.15%      43.371us       3.614us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.750us         0.48%       9.750us       0.812us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.07%      41.690us         2.07%      41.690us       6.948us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.140us         0.26%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.093ms
-Self CUDA time total: 25.151us
+Self CPU time total: 2.014ms
+Self CUDA time total: 24.993us
 
 
 
@@ -4119,23 +4119,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.288us      1348.70%     348.288us     348.288us             1  
-                                      hf_kernels_rotary         8.04%     167.026us        99.77%       2.072ms       2.072ms       0.000us         0.00%      27.136us      27.136us             1  
-                          _rotary_dba7d1e::apply_rotary         2.17%      45.031us         4.15%      86.212us      14.369us      18.016us        69.76%      18.016us       3.003us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.016us        69.76%      18.016us       3.003us             6  
-                                            aten::clone         1.23%      25.613us        85.56%       1.777ms     296.124us       0.000us         0.00%       9.120us       1.520us             6  
-                                            aten::copy_         1.80%      37.380us        82.71%       1.718ms     286.270us       7.808us        30.24%       9.120us       1.520us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        30.24%       7.808us       1.301us             6  
-                                Activity Buffer Request        69.08%       1.434ms        69.08%       1.434ms       1.434ms       1.312us         5.08%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.61%      33.511us         1.61%      33.511us       5.585us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.83%     245.758us        11.83%     245.758us      40.960us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.59%      33.022us         2.01%      41.843us       3.487us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.42%       8.821us         0.42%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.98%      41.181us         1.98%      41.181us       6.863us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.770us         0.23%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.469us      1341.21%     345.469us     345.469us             1  
+                                      hf_kernels_rotary         8.14%     161.605us        99.74%       1.979ms       1.979ms       0.000us         0.00%      27.070us      27.070us             1  
+                          _rotary_dba7d1e::apply_rotary         2.10%      41.690us         4.19%      83.112us      13.852us      17.982us        69.81%      17.982us       2.997us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.982us        69.81%      17.982us       2.997us             6  
+                                            aten::clone         1.15%      22.842us        85.12%       1.689ms     281.515us       0.000us         0.00%       9.088us       1.515us             6  
+                                            aten::copy_         1.84%      36.466us        82.36%       1.634ms     272.405us       7.776us        30.19%       9.088us       1.515us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        30.19%       7.776us       1.296us             6  
+                                Activity Buffer Request        71.40%       1.417ms        71.40%       1.417ms       1.417ms       1.312us         5.09%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.60%      31.821us         1.60%      31.821us       5.303us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.12%     181.057us         9.12%     181.057us      30.176us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.80%      35.740us         2.29%      45.520us       3.793us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%       9.780us         0.49%       9.780us       0.815us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.09%      41.422us         2.09%      41.422us       6.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.151us         0.26%       5.151us       5.151us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.077ms
-Self CUDA time total: 25.824us
+Self CPU time total: 1.984ms
+Self CUDA time total: 25.758us
 
 
 
@@ -4145,23 +4145,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.589us      1056.85%     342.589us     342.589us             1  
-                                      hf_kernels_rotary         8.06%     166.005us        99.77%       2.055ms       2.055ms       0.000us         0.00%      34.208us      34.208us             1  
-                          _rotary_dba7d1e::apply_rotary         2.10%      43.163us         4.03%      82.914us      13.819us      21.856us        67.42%      21.856us       3.643us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.856us        67.42%      21.856us       3.643us             6  
-                                            aten::clone         1.18%      24.311us        85.73%       1.766ms     294.310us       0.000us         0.00%      12.352us       2.059us             6  
-                                            aten::copy_         1.85%      38.151us        82.92%       1.708ms     284.677us      10.560us        32.58%      12.352us       2.059us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        32.58%      10.560us       1.760us             6  
-                                Activity Buffer Request        69.37%       1.429ms        69.37%       1.429ms       1.429ms       1.792us         5.53%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.63%      33.490us         1.63%      33.490us       5.582us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.70%     241.040us        11.70%     241.040us      40.173us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.54%      31.672us         1.96%      40.421us       3.368us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.42%       8.749us         0.42%       8.749us       0.729us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.93%      39.751us         1.93%      39.751us       6.625us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.681us         0.23%       4.681us       4.681us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     370.847us      1148.52%     370.847us     370.847us             1  
+                                      hf_kernels_rotary         8.48%     171.185us        99.77%       2.015ms       2.015ms       0.000us         0.00%      34.081us      34.081us             1  
+                          _rotary_dba7d1e::apply_rotary         2.32%      46.763us         4.49%      90.723us      15.120us      21.793us        67.49%      21.793us       3.632us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.793us        67.49%      21.793us       3.632us             6  
+                                            aten::clone         1.25%      25.309us        84.59%       1.708ms     284.718us       0.000us         0.00%      12.288us       2.048us             6  
+                                            aten::copy_         1.96%      39.631us        81.62%       1.648ms     274.723us      10.496us        32.51%      12.288us       2.048us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us        32.51%      10.496us       1.749us             6  
+                                Activity Buffer Request        70.18%       1.417ms        70.18%       1.417ms       1.417ms       1.792us         5.55%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.72%      34.661us         1.72%      34.661us       5.777us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.48%     191.424us         9.48%     191.424us      31.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.73%      34.932us         2.22%      44.771us       3.731us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%       9.839us         0.49%       9.839us       0.820us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.18%      43.960us         2.18%      43.960us       7.327us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.601us         0.23%       4.601us       4.601us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.060ms
-Self CUDA time total: 32.416us
+Self CPU time total: 2.020ms
+Self CUDA time total: 32.289us
 
 
 
@@ -4171,23 +4171,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.021us       674.53%     349.021us     349.021us             1  
-                                      hf_kernels_rotary         8.13%     167.188us        99.77%       2.053ms       2.053ms       0.000us         0.00%      54.656us      54.656us             1  
-                          _rotary_dba7d1e::apply_rotary         2.05%      42.101us         4.09%      84.171us      14.029us      34.590us        66.85%      34.590us       5.765us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.590us        66.85%      34.590us       5.765us             6  
-                                            aten::clone         1.20%      24.743us        85.45%       1.758ms     292.975us       0.000us         0.00%      20.066us       3.344us             6  
-                                            aten::copy_         1.77%      36.360us        82.61%       1.700ms     283.256us      17.153us        33.15%      20.066us       3.344us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.153us        33.15%      17.153us       2.859us             6  
-                                Activity Buffer Request        69.27%       1.425ms        69.27%       1.425ms       1.425ms       2.913us         5.63%       2.913us       2.913us             1  
-                                    aten::empty_strided         1.63%      33.571us         1.63%      33.571us       5.595us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.58%     238.157us        11.58%     238.157us      39.693us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.68%      34.499us         2.11%      43.362us       3.614us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.43%       8.863us         0.43%       8.863us       0.739us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.04%      42.070us         2.04%      42.070us       7.012us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.701us         0.23%       4.701us       4.701us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.533us       668.21%     345.533us     345.533us             1  
+                                      hf_kernels_rotary         8.13%     161.677us        99.76%       1.983ms       1.983ms       0.000us         0.00%      54.558us      54.558us             1  
+                          _rotary_dba7d1e::apply_rotary         2.15%      42.810us         4.29%      85.240us      14.207us      34.782us        67.26%      34.782us       5.797us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.782us        67.26%      34.782us       5.797us             6  
+                                            aten::clone         1.16%      23.089us        85.02%       1.690ms     281.665us       0.000us         0.00%      19.776us       3.296us             6  
+                                            aten::copy_         1.78%      35.482us        82.32%       1.636ms     272.722us      16.928us        32.74%      19.776us       3.296us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us        32.74%      16.928us       2.821us             6  
+                                Activity Buffer Request        71.53%       1.422ms        71.53%       1.422ms       1.422ms       2.848us         5.51%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.54%      30.571us         1.54%      30.571us       5.095us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.00%     178.904us         9.00%     178.904us      29.817us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.84%      36.581us         2.32%      46.051us       3.838us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.470us         0.48%       9.470us       0.789us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.13%      42.430us         2.13%      42.430us       7.072us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.870us         0.24%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.057ms
-Self CUDA time total: 51.743us
+Self CPU time total: 1.988ms
+Self CUDA time total: 51.710us
 
 
 
@@ -4197,23 +4197,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     342.845us      1058.69%     342.845us     342.845us             1  
-                                      hf_kernels_rotary         7.95%     162.638us        99.78%       2.041ms       2.041ms       0.000us         0.00%      34.176us      34.176us             1  
-                          _rotary_dba7d1e::apply_rotary         2.08%      42.501us         4.07%      83.221us      13.870us      21.760us        67.19%      21.760us       3.627us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.760us        67.19%      21.760us       3.627us             6  
-                                            aten::clone         1.16%      23.762us        85.72%       1.754ms     292.258us       0.000us         0.00%      12.416us       2.069us             6  
-                                            aten::copy_         1.82%      37.190us        83.02%       1.698ms     283.036us      10.624us        32.81%      12.416us       2.069us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.624us        32.81%      10.624us       1.771us             6  
-                                Activity Buffer Request        69.60%       1.424ms        69.60%       1.424ms       1.424ms       1.792us         5.53%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.54%      31.570us         1.54%      31.570us       5.262us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.60%     237.247us        11.60%     237.247us      39.541us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.62%      33.195us         2.03%      41.584us       3.465us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.41%       8.389us         0.41%       8.389us       0.699us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.99%      40.720us         1.99%      40.720us       6.787us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       4.600us         0.22%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.136us      1047.28%     338.136us     338.136us             1  
+                                      hf_kernels_rotary        19.11%     157.801us        99.43%     820.869us     820.869us       0.000us         0.00%      34.078us      34.078us             1  
+                          _rotary_dba7d1e::apply_rotary         5.12%      42.269us        10.18%      84.080us      14.013us      21.792us        67.49%      21.792us       3.632us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.792us        67.49%      21.792us       3.632us             6  
+                                            aten::clone         2.56%      21.133us        65.13%     537.684us      89.614us       0.000us         0.00%      12.286us       2.048us             6  
+                                            aten::copy_         4.56%      37.650us        58.77%     485.172us      80.862us      10.495us        32.51%      12.286us       2.048us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us        32.51%      10.495us       1.749us             6  
+                                Activity Buffer Request        32.51%     268.347us        32.51%     268.347us     268.347us       1.791us         5.55%       1.791us       1.791us             1  
+                                    aten::empty_strided         3.80%      31.379us         3.80%      31.379us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.70%     179.175us        21.70%     179.175us      29.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.93%      32.405us         5.00%      41.304us       3.442us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.08%       8.899us         1.08%       8.899us       0.742us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.06%      41.811us         5.06%      41.811us       6.969us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.680us         0.57%       4.680us       4.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.046ms
-Self CUDA time total: 32.384us
+Self CPU time total: 825.549us
+Self CUDA time total: 32.287us
 
 
 
@@ -4223,23 +4223,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.276us       667.68%     345.276us     345.276us             1  
-                                      hf_kernels_rotary        17.87%     159.778us        99.47%     889.262us     889.262us       0.000us         0.00%      54.593us      54.593us             1  
-                          _rotary_dba7d1e::apply_rotary         4.83%      43.201us         9.55%      85.402us      14.234us      34.656us        67.02%      34.656us       5.776us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.656us        67.02%      34.656us       5.776us             6  
-                                            aten::clone         2.69%      24.052us        67.57%     604.071us     100.678us       0.000us         0.00%      19.937us       3.323us             6  
-                                            aten::copy_         3.98%      35.591us        61.32%     548.169us      91.362us      17.057us        32.98%      19.937us       3.323us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.057us        32.98%      17.057us       2.843us             6  
-                                Activity Buffer Request        31.28%     279.600us        31.28%     279.600us     279.600us       2.880us         5.57%       2.880us       2.880us             1  
-                                    aten::empty_strided         3.56%      31.850us         3.56%      31.850us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.06%     232.978us        26.06%     232.978us      38.830us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.51%      31.369us         4.48%      40.011us       3.334us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.97%       8.642us         0.97%       8.642us       0.720us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.72%      42.201us         4.72%      42.201us       7.034us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.53%       4.740us         0.53%       4.740us       4.740us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.832us       672.66%     347.832us     347.832us             1  
+                                      hf_kernels_rotary        18.98%     156.996us        99.42%     822.501us     822.501us       0.000us         0.00%      54.558us      54.558us             1  
+                          _rotary_dba7d1e::apply_rotary         5.15%      42.621us        10.22%      84.512us      14.085us      34.783us        67.27%      34.783us       5.797us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.783us        67.27%      34.783us       5.797us             6  
+                                            aten::clone         2.65%      21.930us        64.92%     537.102us      89.517us       0.000us         0.00%      19.775us       3.296us             6  
+                                            aten::copy_         4.53%      37.450us        58.33%     482.542us      80.424us      16.927us        32.73%      19.775us       3.296us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.927us        32.73%      16.927us       2.821us             6  
+                                Activity Buffer Request        32.06%     265.247us        32.06%     265.247us     265.247us       2.848us         5.51%       2.848us       2.848us             1  
+                                    aten::empty_strided         3.94%      32.630us         3.94%      32.630us       5.438us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.74%     179.845us        21.74%     179.845us      29.974us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.14%      34.239us         5.31%      43.891us       3.658us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.17%       9.652us         1.17%       9.652us       0.804us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.06%      41.891us         5.06%      41.891us       6.982us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.770us         0.58%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 894.002us
-Self CUDA time total: 51.713us
+Self CPU time total: 827.271us
+Self CUDA time total: 51.710us
 
 
 
@@ -4249,23 +4249,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     372.345us       343.04%     372.345us     372.345us             1  
-                                      hf_kernels_rotary        19.45%     178.278us        99.48%     911.643us     911.643us       0.000us         0.00%     126.592us     126.592us             1  
-                                            aten::clone         2.39%      21.900us        65.33%     598.671us      99.778us       0.000us         0.00%      69.792us      11.632us             6  
-                                            aten::copy_         4.20%      38.503us        59.48%     545.071us      90.845us      51.744us        47.67%      69.792us      11.632us             6  
-                          _rotary_dba7d1e::apply_rotary         5.03%      46.070us         9.81%      89.853us      14.975us      56.800us        52.33%      56.800us       9.467us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      56.800us        52.33%      56.800us       9.467us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.744us        47.67%      51.744us       8.624us             6  
-                                Activity Buffer Request        29.76%     272.689us        29.76%     272.689us     272.689us      18.048us        16.63%      18.048us      18.048us             1  
-                                    aten::empty_strided         3.46%      31.700us         3.46%      31.700us       5.283us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.52%     233.879us        25.52%     233.879us      38.980us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.90%      35.730us         4.89%      44.841us       3.737us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.99%       9.111us         0.99%       9.111us       0.759us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.78%      43.783us         4.78%      43.783us       7.297us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.52%       4.730us         0.52%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.413us       323.34%     352.413us     352.413us             1  
+                                      hf_kernels_rotary        18.38%     152.793us        99.44%     826.801us     826.801us       0.000us         0.00%     127.423us     127.423us             1  
+                                            aten::clone         2.64%      21.959us        64.91%     539.754us      89.959us       0.000us         0.00%      69.984us      11.664us             6  
+                                            aten::copy_         4.48%      37.251us        58.50%     486.434us      81.072us      51.552us        47.30%      69.984us      11.664us             6  
+                          _rotary_dba7d1e::apply_rotary         5.35%      44.522us        10.55%      87.704us      14.617us      57.439us        52.70%      57.439us       9.573us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      57.439us        52.70%      57.439us       9.573us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.552us        47.30%      51.552us       8.592us             6  
+                                Activity Buffer Request        32.52%     270.437us        32.52%     270.437us     270.437us      18.432us        16.91%      18.432us      18.432us             1  
+                                    aten::empty_strided         3.77%      31.361us         3.77%      31.361us       5.227us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.50%     178.746us        21.50%     178.746us      29.791us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.45%      36.960us         5.60%      46.550us       3.879us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.15%       9.590us         1.15%       9.590us       0.799us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.19%      43.182us         5.19%      43.182us       7.197us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.56%       4.690us         0.56%       4.690us       4.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 916.373us
-Self CUDA time total: 108.544us
+Self CPU time total: 831.491us
+Self CUDA time total: 108.991us
 
 
 
@@ -4275,23 +4275,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     373.881us       208.27%     373.881us     373.881us             1  
-                                      hf_kernels_rotary        17.56%     156.837us        99.52%     888.752us     888.752us       0.000us         0.00%     203.231us     203.231us             1  
-                                            aten::clone         2.51%      22.450us        65.45%     584.500us      97.417us       0.000us         0.00%     102.431us      17.072us             6  
-                                            aten::copy_         4.24%      37.839us        59.27%     529.299us      88.217us      78.719us        43.85%     102.431us      17.072us             6  
-                          _rotary_dba7d1e::apply_rotary         4.89%      43.682us        11.68%     104.316us      17.386us     100.800us        56.15%     100.800us      16.800us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     100.800us        56.15%     100.800us      16.800us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.719us        43.85%      78.719us      13.120us             6  
-                                Activity Buffer Request        29.56%     264.020us        29.56%     264.020us     264.020us      23.712us        13.21%      23.712us      23.712us             1  
-                                    aten::empty_strided         3.67%      32.751us         3.67%      32.751us       5.458us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.47%     227.440us        25.47%     227.440us      37.907us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.79%      33.838us         4.83%      43.099us       3.592us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.04%       9.261us         1.04%       9.261us       0.772us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         6.79%      60.634us         6.79%      60.634us      10.106us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.48%       4.320us         0.48%       4.320us       4.320us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     354.429us       196.77%     354.429us     354.429us             1  
+                                      hf_kernels_rotary        18.96%     156.272us        99.48%     819.980us     819.980us       0.000us         0.00%     203.900us     203.900us             1  
+                                            aten::clone         2.73%      22.479us        64.84%     534.473us      89.079us       0.000us         0.00%     102.557us      17.093us             6  
+                                            aten::copy_         4.31%      35.551us        58.35%     480.933us      80.156us      78.782us        43.74%     102.557us      17.093us             6  
+                          _rotary_dba7d1e::apply_rotary         5.14%      42.393us        10.35%      85.274us      14.212us     101.343us        56.26%     101.343us      16.890us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     101.343us        56.26%     101.343us      16.890us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.782us        43.74%      78.782us      13.130us             6  
+                                Activity Buffer Request        32.52%     268.027us        32.52%     268.027us     268.027us      23.775us        13.20%      23.775us      23.775us             1  
+                                    aten::empty_strided         3.77%      31.061us         3.77%      31.061us       5.177us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.52%     177.355us        21.52%     177.355us      29.559us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.12%      33.982us         5.33%      43.961us       3.663us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.21%       9.979us         1.21%       9.979us       0.832us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.20%      42.881us         5.20%      42.881us       7.147us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.52%       4.300us         0.52%       4.300us       4.300us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 893.072us
-Self CUDA time total: 179.519us
+Self CPU time total: 824.280us
+Self CUDA time total: 180.125us
 
 
 
@@ -4301,23 +4301,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.068us      1293.81%     339.068us     339.068us             1  
-                                      hf_kernels_rotary        18.21%     158.266us        99.46%     864.691us     864.691us       0.000us         0.00%      27.359us      27.359us             1  
-                          _rotary_dba7d1e::apply_rotary         4.98%      43.284us         9.71%      84.425us      14.071us      19.391us        73.99%      19.391us       3.232us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.391us        73.99%      19.391us       3.232us             6  
-                                            aten::clone         2.67%      23.179us        66.79%     580.620us      96.770us       0.000us         0.00%       7.968us       1.328us             6  
-                                            aten::copy_         4.38%      38.042us        60.58%     526.630us      87.772us       6.816us        26.01%       7.968us       1.328us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        26.01%       6.816us       1.136us             6  
-                                Activity Buffer Request        29.98%     260.620us        29.98%     260.620us     260.620us       1.152us         4.40%       1.152us       1.152us             1  
-                                    aten::empty_strided         3.54%      30.811us         3.54%      30.811us       5.135us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.22%     227.968us        26.22%     227.968us      37.995us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.77%      32.731us         4.76%      41.380us       3.448us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.99%       8.649us         0.99%       8.649us       0.721us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.73%      41.141us         4.73%      41.141us       6.857us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.54%       4.651us         0.54%       4.651us       4.651us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.587us      1293.50%     338.587us     338.587us             1  
+                                      hf_kernels_rotary        19.34%     157.366us        99.42%     808.960us     808.960us       0.000us         0.00%      27.296us      27.296us             1  
+                          _rotary_dba7d1e::apply_rotary         5.26%      42.761us        10.55%      85.842us      14.307us      19.392us        74.08%      19.392us       3.232us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.392us        74.08%      19.392us       3.232us             6  
+                                            aten::clone         2.60%      21.121us        64.41%     524.052us      87.342us       0.000us         0.00%       7.904us       1.317us             6  
+                                            aten::copy_         4.60%      37.442us        58.06%     472.441us      78.740us       6.784us        25.92%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        25.92%       6.784us       1.131us             6  
+                                Activity Buffer Request        31.61%     257.196us        31.61%     257.196us     257.196us       1.120us         4.28%       1.120us       1.120us             1  
+                                    aten::empty_strided         3.75%      30.490us         3.75%      30.490us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.85%     177.803us        21.85%     177.803us      29.634us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.95%      32.140us         5.12%      41.700us       3.475us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.17%       9.560us         1.17%       9.560us       0.797us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.29%      43.081us         5.29%      43.081us       7.180us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.711us         0.58%       4.711us       4.711us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 869.342us
-Self CUDA time total: 26.207us
+Self CPU time total: 813.671us
+Self CUDA time total: 26.176us
 
 
 
@@ -4327,23 +4327,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.689us      1259.02%     345.689us     345.689us             1  
-                                      hf_kernels_rotary        18.17%     159.455us        99.46%     872.870us     872.870us       0.000us         0.00%      28.769us      28.769us             1  
-                          _rotary_dba7d1e::apply_rotary         4.92%      43.180us         9.80%      85.973us      14.329us      19.616us        71.44%      19.616us       3.269us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.616us        71.44%      19.616us       3.269us             6  
-                                            aten::clone         2.64%      23.140us        66.83%     586.460us      97.743us       0.000us         0.00%       9.153us       1.526us             6  
-                                            aten::copy_         4.27%      37.430us        60.39%     529.960us      88.327us       7.841us        28.56%       9.153us       1.526us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us        28.56%       7.841us       1.307us             6  
-                                Activity Buffer Request        29.89%     262.350us        29.89%     262.350us     262.350us       1.312us         4.78%       1.312us       1.312us             1  
-                                    aten::empty_strided         3.80%      33.360us         3.80%      33.360us       5.560us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.23%     230.180us        26.23%     230.180us      38.363us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.66%      32.161us         4.67%      40.982us       3.415us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.01%       8.821us         1.01%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.88%      42.793us         4.88%      42.793us       7.132us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.54%       4.730us         0.54%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.862us      1278.50%     349.862us     349.862us             1  
+                                      hf_kernels_rotary        19.32%     156.134us        99.42%     803.460us     803.460us       0.000us         0.00%      28.709us      28.709us             1  
+                          _rotary_dba7d1e::apply_rotary         5.33%      43.099us        10.84%      87.643us      14.607us      19.428us        71.00%      19.428us       3.238us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.428us        71.00%      19.428us       3.238us             6  
+                                            aten::clone         2.80%      22.600us        63.71%     514.893us      85.816us       0.000us         0.00%       9.281us       1.547us             6  
+                                            aten::copy_         4.89%      39.481us        56.99%     460.582us      76.764us       7.937us        29.00%       9.281us       1.547us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.937us        29.00%       7.937us       1.323us             6  
+                                Activity Buffer Request        27.85%     225.076us        27.85%     225.076us     225.076us       1.344us         4.91%       1.344us       1.344us             1  
+                                    aten::empty_strided         3.92%      31.711us         3.92%      31.711us       5.285us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        24.26%     196.025us        24.26%     196.025us      32.671us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.38%      35.400us         5.54%      44.790us       3.732us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.16%       9.390us         1.16%       9.390us       0.782us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.51%      44.544us         5.51%      44.544us       7.424us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.720us         0.58%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 877.600us
-Self CUDA time total: 27.457us
+Self CPU time total: 808.180us
+Self CUDA time total: 27.365us
 
 
 
@@ -4353,23 +4353,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.280us      1238.42%     352.280us     352.280us             1  
-                                      hf_kernels_rotary        18.63%     163.526us        99.48%     873.041us     873.041us       0.000us         0.00%      29.790us      29.790us             1  
-                          _rotary_dba7d1e::apply_rotary         4.98%      43.742us         9.85%      86.414us      14.402us      20.606us        72.44%      20.606us       3.434us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.606us        72.44%      20.606us       3.434us             6  
-                                            aten::clone         2.59%      22.720us        66.23%     581.279us      96.880us       0.000us         0.00%       9.184us       1.531us             6  
-                                            aten::copy_         4.14%      36.351us        59.98%     526.379us      87.730us       7.840us        27.56%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        27.56%       7.840us       1.307us             6  
-                                Activity Buffer Request        30.03%     263.549us        30.03%     263.549us     263.549us       1.344us         4.72%       1.344us       1.344us             1  
-                                    aten::empty_strided         3.67%      32.180us         3.67%      32.180us       5.363us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.81%     226.479us        25.81%     226.479us      37.747us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.76%      33.033us         4.77%      41.822us       3.485us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.00%       8.789us         1.00%       8.789us       0.732us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.86%      42.672us         4.86%      42.672us       7.112us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.52%       4.560us         0.52%       4.560us       4.560us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.981us      1235.85%     349.981us     349.981us             1  
+                                      hf_kernels_rotary         8.03%     161.215us        99.76%       2.003ms       2.003ms       0.000us         0.00%      29.663us      29.663us             1  
+                          _rotary_dba7d1e::apply_rotary         2.11%      42.422us         4.23%      84.982us      14.164us      20.544us        72.54%      20.544us       3.424us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.544us        72.54%      20.544us       3.424us             6  
+                                            aten::clone         1.12%      22.572us        85.29%       1.712ms     285.349us       0.000us         0.00%       9.119us       1.520us             6  
+                                            aten::copy_         1.91%      38.260us        82.54%       1.657ms     276.143us       7.775us        27.46%       9.119us       1.520us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us        27.46%       7.775us       1.296us             6  
+                                Activity Buffer Request        71.67%       1.439ms        71.67%       1.439ms       1.439ms       1.344us         4.75%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.63%      32.660us         1.63%      32.660us       5.443us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.96%     179.936us         8.96%     179.936us      29.989us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.74%      34.910us         2.20%      44.250us       3.688us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.47%       9.340us         0.47%       9.340us       0.778us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.12%      42.560us         2.12%      42.560us       7.093us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.741us         0.24%       4.741us       4.741us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 877.601us
-Self CUDA time total: 28.446us
+Self CPU time total: 2.007ms
+Self CUDA time total: 28.319us
 
 
 
@@ -4379,23 +4379,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     341.881us       953.86%     341.881us     341.881us             1  
-                                      hf_kernels_rotary        17.61%     155.956us        99.45%     880.921us     880.921us       0.000us         0.00%      37.634us      37.634us             1  
-                          _rotary_dba7d1e::apply_rotary         4.86%      43.060us         9.73%      86.184us      14.364us      25.312us        70.62%      25.312us       4.219us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.312us        70.62%      25.312us       4.219us             6  
-                                            aten::clone         2.52%      22.319us        67.43%     597.290us      99.548us       0.000us         0.00%      12.322us       2.054us             6  
-                                            aten::copy_         4.12%      36.502us        61.34%     543.331us      90.555us      10.530us        29.38%      12.322us       2.054us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.530us        29.38%      10.530us       1.755us             6  
-                                Activity Buffer Request        31.67%     280.550us        31.67%     280.550us     280.550us       1.792us         5.00%       1.792us       1.792us             1  
-                                    aten::empty_strided         3.57%      31.640us         3.57%      31.640us       5.273us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.54%     226.279us        25.54%     226.279us      37.713us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.70%      32.812us         4.68%      41.491us       3.458us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.98%       8.679us         0.98%       8.679us       0.723us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.87%      43.124us         4.87%      43.124us       7.187us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.55%       4.910us         0.55%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.238us       971.27%     346.238us     346.238us             1  
+                                      hf_kernels_rotary         8.04%     160.124us        99.76%       1.988ms       1.988ms       0.000us         0.00%      37.440us      37.440us             1  
+                          _rotary_dba7d1e::apply_rotary         2.20%      43.921us         4.24%      84.493us      14.082us      25.216us        70.74%      25.216us       4.203us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.216us        70.74%      25.216us       4.203us             6  
+                                            aten::clone         1.14%      22.762us        85.30%       1.700ms     283.325us       0.000us         0.00%      12.224us       2.037us             6  
+                                            aten::copy_         1.84%      36.620us        82.53%       1.645ms     274.105us      10.432us        29.26%      12.224us       2.037us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        29.26%      10.432us       1.739us             6  
+                                Activity Buffer Request        71.70%       1.429ms        71.70%       1.429ms       1.429ms       1.792us         5.03%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.63%      32.561us         1.63%      32.561us       5.427us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.99%     179.114us         8.99%     179.114us      29.852us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.72%      34.250us         2.18%      43.390us       3.616us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.46%       9.140us         0.46%       9.140us       0.762us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.04%      40.572us         2.04%      40.572us       6.762us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.860us         0.24%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 885.831us
-Self CUDA time total: 35.842us
+Self CPU time total: 1.993ms
+Self CUDA time total: 35.648us
 
 
 
@@ -4405,23 +4405,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.158us      1221.01%     348.158us     348.158us             1  
-                                      hf_kernels_rotary         7.73%     158.832us        99.76%       2.051ms       2.051ms       0.000us         0.00%      29.858us      29.858us             1  
-                          _rotary_dba7d1e::apply_rotary         2.18%      44.723us         4.13%      84.825us      14.138us      20.674us        72.50%      20.674us       3.446us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.674us        72.50%      20.674us       3.446us             6  
-                                            aten::clone         1.24%      25.490us        85.81%       1.764ms     294.032us       0.000us         0.00%       9.184us       1.531us             6  
-                                            aten::copy_         1.80%      37.082us        83.01%       1.707ms     284.462us       7.840us        27.50%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        27.50%       7.840us       1.307us             6  
-                                Activity Buffer Request        70.14%       1.442ms        70.14%       1.442ms       1.442ms       1.344us         4.71%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.55%      31.931us         1.55%      31.931us       5.322us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.07%     227.598us        11.07%     227.598us      37.933us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.67%      34.312us         2.11%      43.312us       3.609us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       9.000us         0.44%       9.000us       0.750us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.95%      40.102us         1.95%      40.102us       6.684us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.880us         0.24%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.675us      1229.10%     347.675us     347.675us             1  
+                                      hf_kernels_rotary         8.06%     160.274us        99.76%       1.984ms       1.984ms       0.000us         0.00%      29.631us      29.631us             1  
+                          _rotary_dba7d1e::apply_rotary         2.18%      43.331us         4.28%      85.164us      14.194us      20.511us        72.51%      20.511us       3.418us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.511us        72.51%      20.511us       3.418us             6  
+                                            aten::clone         1.13%      22.531us        85.26%       1.696ms     282.610us       0.000us         0.00%       9.120us       1.520us             6  
+                                            aten::copy_         1.97%      39.252us        82.52%       1.641ms     273.528us       7.776us        27.49%       9.120us       1.520us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        27.49%       7.776us       1.296us             6  
+                                Activity Buffer Request        71.58%       1.424ms        71.58%       1.424ms       1.424ms       1.344us         4.75%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.61%      31.959us         1.61%      31.959us       5.326us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.97%     178.354us         8.97%     178.354us      29.726us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.68%      33.430us         2.16%      42.920us       3.577us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.490us         0.48%       9.490us       0.791us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.10%      41.833us         2.10%      41.833us       6.972us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.801us         0.24%       4.801us       4.801us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.056ms
-Self CUDA time total: 28.514us
+Self CPU time total: 1.989ms
+Self CUDA time total: 28.287us
 
 
 
@@ -4431,23 +4431,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.320us       959.86%     344.320us     344.320us             1  
-                                      hf_kernels_rotary        18.29%     156.315us        99.44%     849.960us     849.960us       0.000us         0.00%      37.664us      37.664us             1  
-                          _rotary_dba7d1e::apply_rotary         5.15%      43.990us        10.72%      91.654us      15.276us      25.312us        70.56%      25.312us       4.219us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.312us        70.56%      25.312us       4.219us             6  
-                                            aten::clone         2.62%      22.368us        65.70%     561.560us      93.593us       0.000us         0.00%      12.352us       2.059us             6  
-                                            aten::copy_         4.13%      35.283us        59.24%     506.308us      84.385us      10.560us        29.44%      12.352us       2.059us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        29.44%      10.560us       1.760us             6  
-                                Activity Buffer Request        29.39%     251.239us        29.39%     251.239us     251.239us       1.792us         5.00%       1.792us       1.792us             1  
-                                    aten::empty_strided         3.85%      32.884us         3.85%      32.884us       5.481us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.71%     219.786us        25.71%     219.786us      36.631us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.67%      31.402us         4.73%      40.431us       3.369us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.06%       9.029us         1.06%       9.029us       0.752us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.58%      47.664us         5.58%      47.664us       7.944us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.56%       4.781us         0.56%       4.781us       4.781us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     341.434us       959.52%     341.434us     341.434us             1  
+                                      hf_kernels_rotary        20.68%     156.375us        99.37%     751.248us     751.248us       0.000us         0.00%      37.312us      37.312us             1  
+                          _rotary_dba7d1e::apply_rotary         5.66%      42.780us        11.14%      84.232us      14.039us      25.184us        70.77%      25.184us       4.197us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.184us        70.77%      25.184us       4.197us             6  
+                                            aten::clone         3.01%      22.779us        61.92%     468.081us      78.014us       0.000us         0.00%      12.128us       2.021us             6  
+                                            aten::copy_         4.78%      36.161us        54.65%     413.150us      68.858us      10.400us        29.23%      12.128us       2.021us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        29.23%      10.400us       1.733us             6  
+                                Activity Buffer Request        26.22%     198.225us        26.22%     198.225us     198.225us       1.728us         4.86%       1.728us       1.728us             1  
+                                    aten::empty_strided         4.25%      32.152us         4.25%      32.152us       5.359us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.65%     178.764us        23.65%     178.764us      29.794us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.40%      33.290us         5.63%      42.560us       3.547us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.23%       9.270us         1.23%       9.270us       0.773us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.48%      41.452us         5.48%      41.452us       6.909us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.63%       4.741us         0.63%       4.741us       4.741us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 854.741us
-Self CUDA time total: 35.872us
+Self CPU time total: 755.989us
+Self CUDA time total: 35.584us
 
 
 
@@ -4457,23 +4457,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     335.158us       593.10%     335.158us     335.158us             1  
-                                      hf_kernels_rotary        18.22%     154.324us        99.44%     842.379us     842.379us       0.000us         0.00%      59.390us      59.390us             1  
-                          _rotary_dba7d1e::apply_rotary         4.99%      42.273us         9.84%      83.374us      13.896us      39.454us        69.82%      39.454us       6.576us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.454us        69.82%      39.454us       6.576us             6  
-                                            aten::clone         2.56%      21.663us        66.58%     564.010us      94.002us       0.000us         0.00%      19.936us       3.323us             6  
-                                            aten::copy_         4.16%      35.260us        60.33%     511.017us      85.169us      17.056us        30.18%      19.936us       3.323us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        30.18%      17.056us       2.843us             6  
-                                Activity Buffer Request        30.26%     256.319us        30.26%     256.319us     256.319us       2.880us         5.10%       2.880us       2.880us             1  
-                                    aten::empty_strided         3.70%      31.330us         3.70%      31.330us       5.222us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.90%     219.438us        25.90%     219.438us      36.573us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.75%      31.762us         4.80%      40.671us       3.389us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.05%       8.909us         1.05%       8.909us       0.742us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.85%      41.101us         4.85%      41.101us       6.850us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.56%       4.710us         0.56%       4.710us       4.710us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.886us       617.06%     349.886us     349.886us             1  
+                                      hf_kernels_rotary        15.93%     158.238us        99.46%     988.285us     988.285us       0.000us         0.00%      59.582us      59.582us             1  
+                          _rotary_dba7d1e::apply_rotary         4.43%      44.009us         8.77%      87.171us      14.528us      39.742us        70.09%      39.742us       6.624us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.742us        70.09%      39.742us       6.624us             6  
+                                            aten::clone         2.20%      21.907us        70.33%     698.845us     116.474us       0.000us         0.00%      19.840us       3.307us             6  
+                                            aten::copy_         3.76%      37.392us        65.02%     646.067us     107.678us      16.960us        29.91%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us        29.91%      16.960us       2.827us             6  
+                                Activity Buffer Request        43.30%     430.221us        43.30%     430.221us     430.221us       2.880us         5.08%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.11%      30.871us         3.11%      30.871us       5.145us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.96%     178.454us        17.96%     178.454us      29.742us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.43%      34.051us         4.43%      44.031us       3.669us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.00%       9.980us         1.00%       9.980us       0.832us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.34%      43.162us         4.34%      43.162us       7.194us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.54%       5.320us         0.54%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 847.089us
-Self CUDA time total: 56.510us
+Self CPU time total: 993.605us
+Self CUDA time total: 56.702us
 
 
 
@@ -4483,23 +4483,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     369.080us       312.82%     369.080us     369.080us             1  
-                                      hf_kernels_rotary        20.18%     177.506us        99.45%     874.621us     874.621us       0.000us         0.00%     134.912us     134.912us             1  
-                                            aten::clone         2.49%      21.878us        64.31%     565.600us      94.267us       0.000us         0.00%      69.696us      11.616us             6  
-                                            aten::copy_         4.23%      37.163us        58.33%     512.969us      85.495us      52.768us        44.72%      69.696us      11.616us             6  
-                          _rotary_dba7d1e::apply_rotary         5.24%      46.042us        10.09%      88.704us      14.784us      65.216us        55.28%      65.216us      10.869us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.216us        55.28%      65.216us      10.869us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.768us        44.72%      52.768us       8.795us             6  
-                                Activity Buffer Request        28.97%     254.819us        28.97%     254.819us     254.819us      16.928us        14.35%      16.928us      16.928us             1  
-                                    aten::empty_strided         3.50%      30.753us         3.50%      30.753us       5.126us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.13%     220.987us        25.13%     220.987us      36.831us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.86%      33.990us         4.87%      42.811us       3.568us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.00%       8.821us         1.00%       8.821us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.85%      42.662us         4.85%      42.662us       7.110us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.55%       4.870us         0.55%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.574us       297.38%     352.574us     352.574us             1  
+                                      hf_kernels_rotary        18.56%     157.003us        99.43%     841.041us     841.041us       0.000us         0.00%     135.680us     135.680us             1  
+                                            aten::clone         2.59%      21.881us        65.75%     556.174us      92.696us       0.000us         0.00%      69.984us      11.664us             6  
+                                            aten::copy_         4.37%      36.992us        59.34%     501.912us      83.652us      52.864us        44.59%      69.984us      11.664us             6  
+                          _rotary_dba7d1e::apply_rotary         5.11%      43.221us        10.14%      85.754us      14.292us      65.696us        55.41%      65.696us      10.949us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.696us        55.41%      65.696us      10.949us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.864us        44.59%      52.864us       8.811us             6  
+                                Activity Buffer Request        33.65%     284.597us        33.65%     284.597us     284.597us      17.120us        14.44%      17.120us      17.120us             1  
+                                    aten::empty_strided         3.83%      32.381us         3.83%      32.381us       5.397us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.32%     180.323us        21.32%     180.323us      30.054us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.89%      32.880us         4.98%      42.110us       3.509us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.09%       9.230us         1.09%       9.230us       0.769us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.03%      42.533us         5.03%      42.533us       7.089us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.810us         0.57%       4.810us       4.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 879.491us
-Self CUDA time total: 117.984us
+Self CPU time total: 845.851us
+Self CUDA time total: 118.560us
 
 
 
@@ -4509,23 +4509,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     360.471us       637.52%     360.471us     360.471us             1  
-                                      hf_kernels_rotary        18.70%     161.865us        99.47%     860.760us     860.760us       0.000us         0.00%      59.391us      59.391us             1  
-                          _rotary_dba7d1e::apply_rotary         5.21%      45.111us        10.32%      89.333us      14.889us      39.487us        69.84%      39.487us       6.581us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.487us        69.84%      39.487us       6.581us             6  
-                                            aten::clone         2.76%      23.842us        65.28%     564.941us      94.157us       0.000us         0.00%      19.904us       3.317us             6  
-                                            aten::copy_         4.31%      37.312us        58.89%     509.589us      84.931us      17.056us        30.16%      19.904us       3.317us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        30.16%      17.056us       2.843us             6  
-                                Activity Buffer Request        29.00%     250.989us        29.00%     250.989us     250.989us       2.848us         5.04%       2.848us       2.848us             1  
-                                    aten::empty_strided         3.64%      31.510us         3.64%      31.510us       5.252us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.57%     221.288us        25.57%     221.288us      36.881us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.04%      34.983us         5.16%      44.621us       3.718us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.11%       9.638us         1.11%       9.638us       0.803us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.11%      44.222us         5.11%      44.222us       7.370us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.53%       4.600us         0.53%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     341.982us       603.45%     341.982us     341.982us             1  
+                                      hf_kernels_rotary        18.98%     155.712us        99.43%     815.710us     815.710us       0.000us         0.00%      59.487us      59.487us             1  
+                          _rotary_dba7d1e::apply_rotary         5.25%      43.112us        10.37%      85.045us      14.174us      39.839us        70.30%      39.839us       6.640us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.839us        70.30%      39.839us       6.640us             6  
+                                            aten::clone         2.51%      20.600us        64.82%     531.763us      88.627us       0.000us         0.00%      19.648us       3.275us             6  
+                                            aten::copy_         4.52%      37.100us        58.54%     480.262us      80.044us      16.832us        29.70%      19.648us       3.275us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us        29.70%      16.832us       2.805us             6  
+                                Activity Buffer Request        32.45%     266.237us        32.45%     266.237us     266.237us       2.816us         4.97%       2.816us       2.816us             1  
+                                    aten::empty_strided         3.77%      30.901us         3.77%      30.901us       5.150us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.57%     176.925us        21.57%     176.925us      29.488us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.05%      33.240us         5.26%      43.190us       3.599us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.21%       9.950us         1.21%       9.950us       0.829us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.11%      41.933us         5.11%      41.933us       6.989us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.700us         0.57%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 865.360us
-Self CUDA time total: 56.543us
+Self CPU time total: 820.410us
+Self CUDA time total: 56.671us
 
 
 
@@ -4535,23 +4535,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.053us       293.57%     348.053us     348.053us             1  
-                                      hf_kernels_rotary        18.59%     158.086us        99.46%     845.630us     845.630us       0.000us         0.00%     135.933us     135.933us             1  
-                                            aten::clone         2.59%      22.020us        65.95%     560.690us      93.448us       0.000us         0.00%      70.752us      11.792us             6  
-                                            aten::copy_         4.43%      37.632us        59.68%     507.389us      84.565us      53.376us        45.02%      70.752us      11.792us             6  
-                          _rotary_dba7d1e::apply_rotary         5.16%      43.870us        10.14%      86.234us      14.372us      65.181us        54.98%      65.181us      10.864us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.181us        54.98%      65.181us      10.864us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.376us        45.02%      53.376us       8.896us             6  
-                                Activity Buffer Request        29.66%     252.179us        29.66%     252.179us     252.179us      17.376us        14.66%      17.376us      17.376us             1  
-                                    aten::empty_strided         3.68%      31.281us         3.68%      31.281us       5.213us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.59%     217.578us        25.59%     217.578us      36.263us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.78%      32.121us         4.78%      40.620us       3.385us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.00%       8.499us         1.00%       8.499us       0.708us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.98%      42.364us         4.98%      42.364us       7.061us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.54%       4.590us         0.54%       4.590us       4.590us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     388.726us       325.86%     388.726us     388.726us             1  
+                                      hf_kernels_rotary        19.76%     169.936us        99.45%     855.401us     855.401us       0.000us         0.00%     136.923us     136.923us             1  
+                                            aten::clone         2.64%      22.710us        63.15%     543.123us      90.521us       0.000us         0.00%      70.877us      11.813us             6  
+                                            aten::copy_         4.46%      38.370us        56.50%     485.931us      80.988us      53.246us        44.64%      70.877us      11.813us             6  
+                          _rotary_dba7d1e::apply_rotary         5.64%      48.490us        10.91%      93.801us      15.634us      66.046us        55.36%      66.046us      11.008us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      66.046us        55.36%      66.046us      11.008us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.246us        44.64%      53.246us       8.874us             6  
+                                Activity Buffer Request        30.83%     265.147us        30.83%     265.147us     265.147us      17.631us        14.78%      17.631us      17.631us             1  
+                                    aten::empty_strided         4.01%      34.482us         4.01%      34.482us       5.747us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.21%     182.414us        21.21%     182.414us      30.402us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.39%      37.781us         5.64%      48.541us       4.045us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.25%      10.760us         1.25%      10.760us       0.897us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.27%      45.311us         5.27%      45.311us       7.552us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.55%       4.700us         0.55%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 850.220us
-Self CUDA time total: 118.557us
+Self CPU time total: 860.101us
+Self CUDA time total: 119.292us
 
 
 
@@ -4561,23 +4561,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.432us       183.93%     361.432us     361.432us             1  
-                                      hf_kernels_rotary        18.55%     158.934us        99.44%     851.909us     851.909us       0.000us         0.00%     220.221us     220.221us             1  
-                          _rotary_dba7d1e::apply_rotary         5.09%      43.629us        10.06%      86.174us      14.362us     115.517us        58.78%     115.517us      19.253us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     115.517us        58.78%     115.517us      19.253us             6  
-                                            aten::clone         2.64%      22.651us        66.00%     565.440us      94.240us       0.000us         0.00%     104.704us      17.451us             6  
-                                            aten::copy_         4.43%      37.970us        59.78%     512.129us      85.355us      80.992us        41.22%     104.704us      17.451us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      80.992us        41.22%      80.992us      13.499us             6  
-                                Activity Buffer Request        29.36%     251.489us        29.36%     251.489us     251.489us      23.712us        12.07%      23.712us      23.712us             1  
-                                    aten::empty_strided         3.58%      30.660us         3.58%      30.660us       5.110us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.99%     222.670us        25.99%     222.670us      37.112us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.80%      32.582us         4.83%      41.361us       3.447us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.02%       8.779us         1.02%       8.779us       0.732us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.97%      42.545us         4.97%      42.545us       7.091us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.56%       4.770us         0.56%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     357.115us       181.96%     357.115us     357.115us             1  
+                                      hf_kernels_rotary        18.86%     155.885us        99.43%     821.750us     821.750us       0.000us         0.00%     219.904us     219.904us             1  
+                          _rotary_dba7d1e::apply_rotary         5.36%      44.321us        10.59%      87.561us      14.594us     115.808us        59.01%     115.808us      19.301us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     115.808us        59.01%     115.808us      19.301us             6  
+                                            aten::clone         2.51%      20.740us        64.81%     535.643us      89.274us       0.000us         0.00%     104.096us      17.349us             6  
+                                            aten::copy_         4.34%      35.891us        58.73%     485.402us      80.900us      80.448us        40.99%     104.096us      17.349us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      80.448us        40.99%      80.448us      13.408us             6  
+                                Activity Buffer Request        32.66%     269.957us        32.66%     269.957us     269.957us      23.648us        12.05%      23.648us      23.648us             1  
+                                    aten::empty_strided         3.57%      29.501us         3.57%      29.501us       4.917us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.72%     179.554us        21.72%     179.554us      29.926us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.97%      32.801us         5.16%      42.661us       3.555us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.19%       9.860us         1.19%       9.860us       0.822us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.23%      43.240us         5.23%      43.240us       7.207us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.750us         0.57%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 856.679us
-Self CUDA time total: 196.509us
+Self CPU time total: 826.500us
+Self CUDA time total: 196.256us
 
 
 
@@ -4587,29 +4587,29 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary        12.27%     154.345us        67.03%     843.460us     843.460us       0.000us         0.00%     849.461us     849.461us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     791.349us       101.00%     791.349us     791.349us             1  
-                                            aten::clone         1.79%      22.531us        44.41%     558.811us      93.135us       0.000us         0.00%     577.848us      96.308us             6  
-                                            aten::copy_         2.94%      36.962us        40.15%     505.198us      84.200us     511.865us        65.33%     577.848us      96.308us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     511.865us        65.33%     511.865us      85.311us             6  
-                          _rotary_dba7d1e::apply_rotary         3.50%      44.071us         7.04%      88.532us      14.755us     271.613us        34.67%     271.613us      45.269us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     271.613us        34.67%     271.613us      45.269us             6  
-                                Activity Buffer Request        20.09%     252.769us        20.09%     252.769us     252.769us      65.983us         8.42%      65.983us      65.983us             1  
-                                    aten::empty_strided         2.47%      31.082us         2.47%      31.082us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.12%     215.467us        17.12%     215.467us      35.911us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.61%      32.851us         3.32%      41.772us       3.481us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.71%       8.921us         0.71%       8.921us       0.743us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         3.53%      44.461us         3.53%      44.461us       7.410us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        32.97%     414.834us        32.97%     414.834us     414.834us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary        13.04%     159.984us        66.42%     814.800us     814.800us       0.000us         0.00%     847.705us     847.705us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     789.466us       101.01%     789.466us     789.466us             1  
+                                            aten::clone         1.84%      22.521us        42.98%     527.184us      87.864us       0.000us         0.00%     577.883us      96.314us             6  
+                                            aten::copy_         2.96%      36.311us        38.61%     473.681us      78.947us     511.772us        65.48%     577.883us      96.314us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     511.772us        65.48%     511.772us      85.295us             6  
+                          _rotary_dba7d1e::apply_rotary         3.59%      44.023us         6.92%      84.943us      14.157us     269.822us        34.52%     269.822us      44.970us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     269.822us        34.52%     269.822us      44.970us             6  
+                                Activity Buffer Request        21.07%     258.456us        21.07%     258.456us     258.456us      66.111us         8.46%      66.111us      66.111us             1  
+                                    aten::empty_strided         2.53%      30.982us         2.53%      30.982us       5.164us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.58%     178.914us        14.58%     178.914us      29.819us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.74%      33.620us         3.48%      42.689us       3.557us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.74%       9.069us         0.74%       9.069us       0.756us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.34%      40.920us         3.34%      40.920us       6.820us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        33.58%     411.910us        33.58%     411.910us     411.910us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.258ms
-Self CUDA time total: 783.478us
+Self CPU time total: 1.227ms
+Self CUDA time total: 781.594us
 
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.10  False
 hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  False
 hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  False
@@ -4635,13 +4635,12 @@ hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  False
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 52 packages in 230ms
+Installed 52 packages in 233ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 5 files:   0%|          | 0/5 [00:00&lt;?, ?it/s]
-Fetching 5 files:  20%|██        | 1/5 [00:00&lt;00:00,  7.39it/s]
-Fetching 5 files: 100%|██████████| 5/5 [00:00&lt;00:00, 16.59it/s]
-Fetching 5 files: 100%|██████████| 5/5 [00:00&lt;00:00, 15.43it/s]</div>
+Fetching 5 files: 100%|██████████| 5/5 [00:00&lt;00:00, 22.14it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00&lt;00:00, 22.12it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/rotary.jsonl" class="artifact" target="_blank">rotary.jsonl</a>
diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html
index f2e07316cf3df8891afa30950cda265901d2fcae..7606a093a65d04c40d580abf67d210368fd50dcd 100644
--- a/rotary/impls/torch_rotary.html
+++ b/rotary/impls/torch_rotary.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.23s
+Cell: nv | 0.20s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:24 2025       
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 14:26:51 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   29C    P0             90W /  350W |       0MiB /  46068MiB |     24%      Default |
+| N/A   32C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 3.87s
+Cell: benchmark | 3.84s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3999,27 +3999,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.099ms      1229.41%       1.099ms       1.099ms             1  
-                                            torch_eager        14.68%     402.893us        99.74%       2.737ms       2.737ms       0.000us         0.00%      90.654us      90.654us             1  
-                                              aten::mul         6.18%     169.712us        10.63%     291.789us      12.158us      46.975us        52.54%      46.975us       1.957us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.975us        52.54%      46.975us       1.957us            24  
-                                            aten::copy_         5.12%     140.498us        62.48%       1.714ms      95.244us      29.151us        32.61%      30.399us       1.689us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.400us        25.05%      22.400us       1.867us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.85%      13.280us       1.107us            12  
-                                            aten::clone         1.37%      37.603us        60.57%       1.662ms     277.027us       0.000us         0.00%       7.999us       1.333us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us         7.55%       6.751us       1.125us             6  
-                                              aten::sub         1.57%      43.112us         2.52%      69.272us      11.545us       6.688us         7.48%       6.688us       1.115us             6  
-                                              aten::add         1.32%      36.261us         2.18%      59.731us       9.955us       6.592us         7.37%       6.592us       1.099us             6  
-                                Activity Buffer Request        52.27%       1.434ms        52.27%       1.434ms       1.434ms       1.248us         1.40%       1.248us       1.248us             1  
-                                    aten::empty_strided         2.02%      55.541us         2.02%      55.541us       9.257us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.66%      72.862us         2.66%      72.862us      12.144us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.02%      82.803us         3.84%     105.504us       4.396us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.83%      22.701us         0.83%      22.701us       0.946us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.69%     238.340us         8.69%     238.340us       4.965us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.26%       7.250us         0.26%       7.250us       7.250us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.124ms      1261.56%       1.124ms       1.124ms             1  
+                                            torch_eager        14.73%     412.767us        99.72%       2.794ms       2.794ms       0.000us         0.00%      90.337us      90.337us             1  
+                                              aten::mul         6.25%     175.043us        11.07%     310.105us      12.921us      46.912us        52.64%      46.912us       1.955us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.912us        52.64%      46.912us       1.955us            24  
+                                            aten::copy_         4.12%     115.463us        61.76%       1.730ms      96.132us      28.993us        32.53%      30.210us       1.678us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.368us        25.10%      22.368us       1.864us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.215us        14.83%      13.215us       1.101us            12  
+                                            aten::clone         1.31%      36.692us        59.66%       1.671ms     278.565us       0.000us         0.00%       7.842us       1.307us             6  
+                                              aten::sub         1.68%      47.063us         2.72%      76.213us      12.702us       6.655us         7.47%       6.655us       1.109us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.625us         7.43%       6.625us       1.104us             6  
+                                              aten::add         1.39%      39.044us         2.34%      65.583us      10.930us       6.560us         7.36%       6.560us       1.093us             6  
+                                Activity Buffer Request        52.45%       1.470ms        52.45%       1.470ms       1.470ms       1.217us         1.37%       1.217us       1.217us             1  
+                                    aten::empty_strided         1.99%      55.621us         1.99%      55.621us       9.270us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.66%      74.431us         2.66%      74.431us      12.405us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.98%      83.492us         3.80%     106.494us       4.437us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.82%      23.002us         0.82%      23.002us       0.958us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.34%     261.675us         9.34%     261.675us       5.452us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.28%       7.890us         0.28%       7.890us       7.890us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.744ms
-Self CUDA time total: 89.406us
+Self CPU time total: 2.802ms
+Self CUDA time total: 89.120us
 
 
 
@@ -4029,27 +4029,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.001ms      1104.88%       1.001ms       1.001ms             1  
-                                            torch_eager        13.31%     340.683us        99.79%       2.555ms       2.555ms       0.000us         0.00%      91.680us      91.680us             1  
-                                              aten::mul         6.04%     154.674us        10.48%     268.377us      11.182us      47.810us        52.79%      47.810us       1.992us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.810us        52.79%      47.810us       1.992us            24  
-                                            aten::copy_         4.35%     111.424us        65.16%       1.668ms      92.682us      29.407us        32.47%      30.527us       1.696us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.559us        24.91%      22.559us       1.880us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.343us        14.73%      13.343us       1.112us            12  
-                                            aten::clone         1.08%      27.742us        62.03%       1.588ms     264.676us       0.000us         0.00%       7.968us       1.328us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.848us         7.56%       6.848us       1.141us             6  
-                                              aten::sub         1.52%      38.791us         2.50%      64.042us      10.674us       6.720us         7.42%       6.720us       1.120us             6  
-                                              aten::add         1.27%      32.413us         2.18%      55.903us       9.317us       6.623us         7.31%       6.623us       1.104us             6  
-                                Activity Buffer Request        56.03%       1.434ms        56.03%       1.434ms       1.434ms       1.120us         1.24%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.42%      36.451us         1.42%      36.451us       6.075us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.10%      53.872us         2.10%      53.872us       8.979us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.86%      73.182us         3.65%      93.342us       3.889us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.79%      20.160us         0.79%      20.160us       0.840us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.02%     231.028us         9.02%     231.028us       4.813us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.420us         0.21%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     968.092us      1071.28%     968.092us     968.092us             1  
+                                            torch_eager        12.50%     317.076us        99.79%       2.532ms       2.532ms       0.000us         0.00%      91.488us      91.488us             1  
+                                              aten::mul         6.07%     153.959us        10.35%     262.528us      10.939us      47.648us        52.73%      47.648us       1.985us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.648us        52.73%      47.648us       1.985us            24  
+                                            aten::copy_         4.16%     105.603us        65.14%       1.653ms      91.828us      29.344us        32.47%      30.464us       1.692us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        25.00%      22.592us       1.883us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.80%      13.376us       1.115us            12  
+                                            aten::clone         1.12%      28.391us        62.74%       1.592ms     265.351us       0.000us         0.00%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us         7.47%       6.752us       1.125us             6  
+                                              aten::sub         1.55%      39.261us         2.49%      63.132us      10.522us       6.688us         7.40%       6.688us       1.115us             6  
+                                              aten::add         1.47%      37.180us         2.35%      59.741us       9.957us       6.688us         7.40%       6.688us       1.115us             6  
+                                Activity Buffer Request        56.17%       1.425ms        56.17%       1.425ms       1.425ms       1.120us         1.24%       1.120us       1.120us             1  
+                                    aten::empty_strided         2.04%      51.662us         2.04%      51.662us       8.610us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.12%      53.792us         2.12%      53.792us       8.965us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.04%      77.153us         3.82%      96.932us       4.039us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      19.779us         0.78%      19.779us       0.824us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.79%     223.101us         8.79%     223.101us       4.648us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.210us         0.21%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.560ms
-Self CUDA time total: 90.560us
+Self CPU time total: 2.538ms
+Self CUDA time total: 90.368us
 
 
 
@@ -4059,27 +4059,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.341us      1003.42%     944.341us     944.341us             1  
-                                            torch_eager        12.66%     316.554us        99.80%       2.495ms       2.495ms       0.000us         0.00%      95.424us      95.424us             1  
-                                              aten::mul         6.01%     150.161us        10.40%     259.987us      10.833us      48.863us        51.92%      48.863us       2.036us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.863us        51.92%      48.863us       2.036us            24  
-                                            aten::copy_         4.06%     101.511us        66.21%       1.655ms      91.941us      30.785us        32.71%      32.097us       1.783us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.009us        24.45%      23.009us       1.917us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.464us        15.37%      14.464us       1.205us            12  
-                                            aten::clone         1.08%      26.971us        63.11%       1.577ms     262.904us       0.000us         0.00%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         8.26%       7.776us       1.296us             6  
-                                              aten::add         1.43%      35.631us         2.33%      58.151us       9.692us       7.233us         7.69%       7.233us       1.205us             6  
-                                              aten::sub         1.42%      35.432us         2.34%      58.413us       9.736us       7.231us         7.68%       7.231us       1.205us             6  
-                                Activity Buffer Request        57.41%       1.435ms        57.41%       1.435ms       1.435ms       1.312us         1.39%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.23%      30.860us         1.23%      30.860us       5.143us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.03%      50.692us         2.03%      50.692us       8.449us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.76%      69.107us         3.55%      88.725us       3.697us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      19.618us         0.78%      19.618us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.92%     222.961us         8.92%     222.961us       4.645us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.071us         0.20%       5.071us       5.071us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.007ms      1071.77%       1.007ms       1.007ms             1  
+                                            torch_eager        12.81%     333.813us        99.77%       2.600ms       2.600ms       0.000us         0.00%      95.234us      95.234us             1  
+                                              aten::mul         6.17%     160.752us        10.75%     280.063us      11.669us      48.706us        51.86%      48.706us       2.029us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.706us        51.86%      48.706us       2.029us            24  
+                                            aten::copy_         4.30%     112.081us        64.85%       1.690ms      93.891us      30.753us        32.74%      32.065us       1.781us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.009us        24.50%      23.009us       1.917us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.463us        15.40%      14.463us       1.205us            12  
+                                            aten::clone         1.08%      28.070us        62.18%       1.621ms     270.093us       0.000us         0.00%       9.056us       1.509us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         8.25%       7.744us       1.291us             6  
+                                              aten::sub         1.50%      39.201us         2.50%      65.063us      10.844us       7.263us         7.73%       7.263us       1.211us             6  
+                                              aten::add         1.40%      36.592us         2.30%      59.882us       9.980us       7.200us         7.67%       7.200us       1.200us             6  
+                                Activity Buffer Request        55.61%       1.449ms        55.61%       1.449ms       1.449ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.87%      48.773us         1.87%      48.773us       8.129us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.21%      57.593us         2.21%      57.593us       9.599us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.85%      74.230us         3.62%      94.450us       3.935us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      20.220us         0.78%      20.220us       0.842us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.19%     239.464us         9.19%     239.464us       4.989us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.23%       5.970us         0.23%       5.970us       5.970us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.500ms
-Self CUDA time total: 94.112us
+Self CPU time total: 2.606ms
+Self CUDA time total: 93.922us
 
 
 
@@ -4089,27 +4089,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     949.272us       934.99%     949.272us     949.272us             1  
-                                            torch_eager        11.74%     319.184us        99.83%       2.715ms       2.715ms       0.000us         0.00%     102.839us     102.839us             1  
-                                              aten::mul         5.42%     147.290us         9.69%     263.662us      10.986us      53.022us        52.22%      53.022us       2.209us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      53.022us        52.22%      53.022us       2.209us            24  
-                                            aten::copy_         3.75%     101.924us        68.58%       1.865ms     103.635us      32.444us        31.96%      33.755us       1.875us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.637us        24.27%      24.637us       2.053us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.062us        15.82%      16.062us       1.339us            12  
-                                            aten::clone         1.13%      30.729us        66.03%       1.796ms     299.314us       0.000us         0.00%       9.118us       1.520us             6  
-                                              aten::add         1.18%      32.140us         2.02%      54.851us       9.142us       8.032us         7.91%       8.032us       1.339us             6  
-                                              aten::sub         1.29%      35.030us         2.16%      58.621us       9.770us       8.030us         7.91%       8.030us       1.338us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         7.69%       7.807us       1.301us             6  
-                                Activity Buffer Request        53.21%       1.447ms        53.21%       1.447ms       1.447ms       1.311us         1.29%       1.311us       1.311us             1  
-                                    aten::empty_strided         1.17%      31.801us         1.17%      31.801us       5.300us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.34%     254.009us         9.34%     254.009us      42.335us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.60%      70.842us         3.35%      90.984us       3.791us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.74%      20.142us         0.74%      20.142us       0.839us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.27%     224.985us         8.27%     224.985us       4.687us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.17%       4.671us         0.17%       4.671us       4.671us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     976.889us       967.02%     976.889us     976.889us             1  
+                                            torch_eager        12.01%     329.416us        99.82%       2.739ms       2.739ms       0.000us         0.00%     102.333us     102.333us             1  
+                                              aten::mul         5.67%     155.545us         9.73%     266.927us      11.122us      52.800us        52.27%      52.800us       2.200us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.800us        52.27%      52.800us       2.200us            24  
+                                            aten::copy_         3.82%     104.765us        68.18%       1.871ms     103.922us      32.349us        32.02%      33.661us       1.870us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.574us        24.33%      24.574us       2.048us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.872us        15.71%      15.872us       1.323us            12  
+                                            aten::clone         1.07%      29.290us        65.23%       1.790ms     298.277us       0.000us         0.00%       9.087us       1.515us             6  
+                                              aten::sub         1.39%      38.150us         2.28%      62.431us      10.405us       7.936us         7.86%       7.936us       1.323us             6  
+                                              aten::add         1.24%      34.113us         2.07%      56.743us       9.457us       7.936us         7.86%       7.936us       1.323us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.70%       7.775us       1.296us             6  
+                                Activity Buffer Request        52.33%       1.436ms        52.33%       1.436ms       1.436ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.16%      31.821us         1.16%      31.821us       5.304us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.42%     258.335us         9.42%     258.335us      43.056us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      72.071us         3.33%      91.411us       3.809us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      19.340us         0.70%      19.340us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.39%     230.176us         8.39%     230.176us       4.795us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.010us         0.18%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.720ms
-Self CUDA time total: 101.528us
+Self CPU time total: 2.744ms
+Self CUDA time total: 101.021us
 
 
 
@@ -4119,27 +4119,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.887us      1005.38%     944.887us     944.887us             1  
-                                            torch_eager        11.86%     320.838us        99.82%       2.700ms       2.700ms       0.000us         0.00%      95.295us      95.295us             1  
-                                              aten::mul         5.37%     145.335us         9.42%     254.837us      10.618us      49.024us        52.16%      49.024us       2.043us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.024us        52.16%      49.024us       2.043us            24  
-                                            aten::copy_         3.87%     104.672us        68.80%       1.861ms     103.396us      30.783us        32.75%      32.095us       1.783us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.38%      22.912us       1.909us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.08%      14.176us       1.181us            12  
-                                            aten::clone         1.07%      28.861us        66.14%       1.789ms     298.231us       0.000us         0.00%       9.183us       1.530us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.871us         8.37%       7.871us       1.312us             6  
-                                              aten::sub         1.26%      33.972us         2.12%      57.464us       9.577us       7.103us         7.56%       7.103us       1.184us             6  
-                                              aten::add         1.16%      31.253us         1.99%      53.964us       8.994us       7.073us         7.53%       7.073us       1.179us             6  
-                                Activity Buffer Request        53.80%       1.456ms        53.80%       1.456ms       1.456ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.17%      31.633us         1.17%      31.633us       5.272us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.82%     238.648us         8.82%     238.648us      39.775us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.67%      72.119us         3.38%      91.532us       3.814us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.72%      19.413us         0.72%      19.413us       0.809us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.06%     217.970us         8.06%     217.970us       4.541us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       4.990us         0.18%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     972.954us      1035.95%     972.954us     972.954us             1  
+                                            torch_eager        11.82%     323.628us        99.83%       2.734ms       2.734ms       0.000us         0.00%      95.231us      95.231us             1  
+                                              aten::mul         5.48%     150.092us         9.71%     265.906us      11.079us      48.958us        52.13%      48.958us       2.040us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.958us        52.13%      48.958us       2.040us            24  
+                                            aten::copy_         4.01%     109.805us        68.55%       1.878ms     104.307us      30.784us        32.78%      32.096us       1.783us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.40%      22.912us       1.909us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.177us        15.09%      14.177us       1.181us            12  
+                                            aten::clone         0.98%      26.740us        65.50%       1.794ms     299.012us       0.000us         0.00%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.38%       7.872us       1.312us             6  
+                                              aten::sub         1.35%      37.100us         2.22%      60.781us      10.130us       7.106us         7.57%       7.106us       1.184us             6  
+                                              aten::add         1.26%      34.471us         2.07%      56.641us       9.440us       7.071us         7.53%       7.071us       1.178us             6  
+                                Activity Buffer Request        53.28%       1.459ms        53.28%       1.459ms       1.459ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.12%      30.591us         1.12%      30.591us       5.098us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.84%     242.034us         8.84%     242.034us      40.339us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.64%      72.284us         3.37%      92.363us       3.848us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.73%      20.079us         0.73%      20.079us       0.837us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.33%     228.067us         8.33%     228.067us       4.751us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.17%       4.701us         0.17%       4.701us       4.701us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.705ms
-Self CUDA time total: 93.983us
+Self CPU time total: 2.739ms
+Self CUDA time total: 93.919us
 
 
 
@@ -4149,27 +4149,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.250us       902.45%     912.250us     912.250us             1  
-                                            torch_eager        10.84%     287.380us        99.80%       2.646ms       2.646ms       0.000us         0.00%     102.398us     102.398us             1  
-                                              aten::mul         5.43%     143.901us         9.61%     254.716us      10.613us      52.767us        52.20%      52.767us       2.199us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.767us        52.20%      52.767us       2.199us            24  
-                                            aten::copy_         3.82%     101.373us        69.76%       1.849ms     102.733us      32.416us        32.07%      33.728us       1.874us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.608us        24.34%      24.608us       2.051us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.903us        15.73%      15.903us       1.325us            12  
-                                            aten::clone         0.89%      23.520us        66.94%       1.774ms     295.745us       0.000us         0.00%       9.120us       1.520us             6  
-                                              aten::add         1.25%      33.223us         2.12%      56.323us       9.387us       7.968us         7.88%       7.968us       1.328us             6  
-                                              aten::sub         1.34%      35.391us         2.21%      58.453us       9.742us       7.935us         7.85%       7.935us       1.322us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.72%       7.808us       1.301us             6  
-                                Activity Buffer Request        54.59%       1.447ms        54.59%       1.447ms       1.447ms       1.312us         1.30%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.14%      30.292us         1.14%      30.292us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.04%     239.538us         9.04%     239.538us      39.923us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.52%      66.730us         3.23%      85.664us       3.569us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.71%      18.934us         0.71%      18.934us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.23%     218.091us         8.23%     218.091us       4.544us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.360us         0.20%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     940.506us       929.78%     940.506us     940.506us             1  
+                                            torch_eager        10.47%     280.203us        99.80%       2.672ms       2.672ms       0.000us         0.00%     102.466us     102.466us             1  
+                                              aten::mul         5.68%     151.942us         9.93%     265.874us      11.078us      52.767us        52.17%      52.767us       2.199us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.767us        52.17%      52.767us       2.199us            24  
+                                            aten::copy_         3.99%     106.699us        69.68%       1.866ms     103.641us      32.384us        32.01%      33.696us       1.872us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.672us        24.39%      24.672us       2.056us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.003us        15.82%      16.003us       1.334us            12  
+                                            aten::clone         0.80%      21.540us        66.42%       1.778ms     296.379us       0.000us         0.00%       9.024us       1.504us             6  
+                                              aten::sub         1.42%      38.052us         2.40%      64.133us      10.689us       8.002us         7.91%       8.002us       1.334us             6  
+                                              aten::add         1.23%      32.860us         2.10%      56.182us       9.364us       8.001us         7.91%       8.001us       1.333us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.62%       7.712us       1.285us             6  
+                                Activity Buffer Request        54.45%       1.458ms        54.45%       1.458ms       1.458ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.14%      30.450us         1.14%      30.450us       5.075us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.74%     234.006us         8.74%     234.006us      39.001us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.58%      69.109us         3.28%      87.850us       3.660us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      18.741us         0.70%      18.741us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.61%     230.527us         8.61%     230.527us       4.803us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.400us         0.20%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.651ms
-Self CUDA time total: 101.086us
+Self CPU time total: 2.677ms
+Self CUDA time total: 101.154us
 
 
 
@@ -4179,27 +4179,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     920.762us       761.21%     920.762us     920.762us             1  
-                                            torch_eager        10.74%     283.666us        99.80%       2.636ms       2.636ms       0.000us         0.00%     122.785us     122.785us             1  
-                                              aten::mul         5.61%     148.102us         9.80%     258.888us      10.787us      62.177us        51.40%      62.177us       2.591us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.177us        51.40%      62.177us       2.591us            24  
-                                            aten::copy_         4.01%     105.842us        69.73%       1.842ms     102.324us      39.520us        32.67%      41.344us       2.297us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.896us        23.89%      28.896us       2.408us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.93%      19.264us       1.605us            12  
-                                            aten::clone         0.81%      21.319us        66.69%       1.761ms     293.582us       0.000us         0.00%      12.448us       2.075us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.624us         8.78%      10.624us       1.771us             6  
-                                              aten::add         1.23%      32.431us         2.08%      54.912us       9.152us       9.696us         8.02%       9.696us       1.616us             6  
-                                              aten::sub         1.34%      35.510us         2.24%      59.050us       9.842us       9.568us         7.91%       9.568us       1.595us             6  
-                                Activity Buffer Request        54.62%       1.443ms        54.62%       1.443ms       1.443ms       1.824us         1.51%       1.824us       1.824us             1  
-                                    aten::empty_strided         1.13%      29.871us         1.13%      29.871us       4.979us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.76%     231.329us         8.76%     231.329us      38.555us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.53%      66.872us         3.28%      86.661us       3.611us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.75%      19.789us         0.75%      19.789us       0.825us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.28%     218.631us         8.28%     218.631us       4.555us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.190us         0.20%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.015ms       844.44%       1.015ms       1.015ms             1  
+                                            torch_eager        10.99%     299.529us        99.80%       2.720ms       2.720ms       0.000us         0.00%     122.045us     122.045us             1  
+                                              aten::mul         5.97%     162.734us        10.28%     280.227us      11.676us      61.856us        51.45%      61.856us       2.577us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.856us        51.45%      61.856us       2.577us            24  
+                                            aten::copy_         4.97%     135.364us        68.63%       1.870ms     103.912us      39.199us        32.61%      41.023us       2.279us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.704us        23.88%      28.704us       2.392us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.166us        15.94%      19.166us       1.597us            12  
+                                            aten::clone         0.84%      22.992us        64.39%       1.755ms     292.512us       0.000us         0.00%      12.319us       2.053us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.73%      10.495us       1.749us             6  
+                                              aten::add         1.19%      32.530us         2.08%      56.691us       9.448us       9.598us         7.98%       9.598us       1.600us             6  
+                                              aten::sub         1.40%      38.111us         2.30%      62.811us      10.468us       9.568us         7.96%       9.568us       1.595us             6  
+                                Activity Buffer Request        52.53%       1.432ms        52.53%       1.432ms       1.432ms       1.824us         1.52%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.18%      32.290us         1.18%      32.290us       5.382us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.53%     232.585us         8.53%     232.585us      38.764us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.71%      73.938us         3.49%      95.000us       3.958us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.77%      21.062us         0.77%      21.062us       0.878us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.70%     237.086us         8.70%     237.086us       4.939us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.570us         0.20%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.641ms
-Self CUDA time total: 120.961us
+Self CPU time total: 2.726ms
+Self CUDA time total: 120.221us
 
 
 
@@ -4209,27 +4209,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.640us       544.89%     939.640us     939.640us             1  
-                                            torch_eager        12.08%     323.576us        99.81%       2.674ms       2.674ms       0.000us         0.00%     175.325us     175.325us             1  
-                                              aten::mul         5.49%     147.107us         9.55%     255.901us      10.663us      89.504us        51.90%      89.504us       3.729us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.504us        51.90%      89.504us       3.729us            24  
-                                            aten::copy_         3.83%     102.724us        68.48%       1.835ms     101.930us      57.918us        33.59%      60.798us       3.378us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.734us        23.62%      40.734us       3.395us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.023us        14.51%      25.023us       2.085us            12  
-                                            aten::clone         1.06%      28.292us        65.67%       1.760ms     293.252us       0.000us         0.00%      20.064us       3.344us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.184us         9.96%      17.184us       2.864us             6  
-                                              aten::add         1.22%      32.572us         2.05%      54.872us       9.145us      12.512us         7.26%      12.512us       2.085us             6  
-                                              aten::sub         1.28%      34.403us         2.15%      57.513us       9.586us      12.511us         7.26%      12.511us       2.085us             6  
-                                Activity Buffer Request        53.69%       1.438ms        53.69%       1.438ms       1.438ms       2.880us         1.67%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.12%      30.100us         1.12%      30.100us       5.017us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.57%     229.599us         8.57%     229.599us      38.267us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.59%      69.394us         3.32%      89.005us       3.709us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.73%      19.611us         0.73%      19.611us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.14%     218.155us         8.14%     218.155us       4.545us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.191us         0.19%       5.191us       5.191us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     951.101us       552.87%     951.101us     951.101us             1  
+                                            torch_eager        11.67%     313.772us        99.81%       2.683ms       2.683ms       0.000us         0.00%     174.878us     174.878us             1  
+                                              aten::mul         5.73%     154.081us         9.89%     265.836us      11.076us      89.599us        52.08%      89.599us       3.733us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.599us        52.08%      89.599us       3.733us            24  
+                                            aten::copy_         3.89%     104.453us        68.40%       1.838ms     102.128us      57.664us        33.52%      60.512us       3.362us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.832us        23.74%      40.832us       3.403us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.767us        14.40%      24.767us       2.064us            12  
+                                            aten::clone         1.01%      27.120us        65.39%       1.758ms     292.937us       0.000us         0.00%      19.680us       3.280us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us         9.78%      16.832us       2.805us             6  
+                                              aten::add         1.27%      34.231us         2.14%      57.531us       9.588us      12.416us         7.22%      12.416us       2.069us             6  
+                                              aten::sub         1.34%      36.001us         2.22%      59.581us       9.930us      12.351us         7.18%      12.351us       2.059us             6  
+                                Activity Buffer Request        53.45%       1.437ms        53.45%       1.437ms       1.437ms       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.13%      30.290us         1.13%      30.290us       5.048us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.55%     229.865us         8.55%     229.865us      38.311us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      70.721us         3.36%      90.322us       3.763us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.73%      19.601us         0.73%      19.601us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.41%     225.976us         8.41%     225.976us       4.708us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.001us         0.19%       5.001us       5.001us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.679ms
-Self CUDA time total: 172.445us
+Self CPU time total: 2.688ms
+Self CUDA time total: 172.030us
 
 
 
@@ -4239,27 +4239,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     910.515us       751.54%     910.515us     910.515us             1  
-                                            torch_eager        19.90%     282.972us        99.65%       1.417ms       1.417ms       0.000us         0.00%     123.009us     123.009us             1  
-                                              aten::mul        10.25%     145.781us        17.92%     254.851us      10.619us      62.146us        51.30%      62.146us       2.589us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.146us        51.30%      62.146us       2.589us            24  
-                                            aten::copy_         7.07%     100.509us        44.20%     628.439us      34.913us      39.743us        32.80%      41.599us       2.311us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      29.055us        23.98%      29.055us       2.421us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.90%      19.264us       1.605us            12  
-                                            aten::clone         1.59%      22.604us        38.82%     551.881us      91.980us       0.000us         0.00%      12.544us       2.091us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.688us         8.82%      10.688us       1.781us             6  
-                                              aten::add         2.23%      31.661us         3.79%      53.922us       8.987us       9.633us         7.95%       9.633us       1.606us             6  
-                                              aten::sub         2.49%      35.352us         4.13%      58.732us       9.789us       9.631us         7.95%       9.631us       1.605us             6  
-                                Activity Buffer Request        16.91%     240.489us        16.91%     240.489us     240.489us       1.856us         1.53%       1.856us       1.856us             1  
-                                    aten::empty_strided         2.06%      29.230us         2.06%      29.230us       4.872us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.93%     226.498us        15.93%     226.498us      37.750us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.75%      67.473us         6.05%      86.070us       3.586us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      18.597us         1.31%      18.597us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.17%     215.654us        15.17%     215.654us       4.493us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       4.980us         0.35%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     927.996us       768.63%     927.996us     927.996us             1  
+                                            torch_eager        20.13%     284.369us        99.65%       1.408ms       1.408ms       0.000us         0.00%     122.557us     122.557us             1  
+                                              aten::mul        10.77%     152.163us        18.72%     264.405us      11.017us      62.048us        51.39%      62.048us       2.585us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.048us        51.39%      62.048us       2.585us            24  
+                                            aten::copy_         7.56%     106.823us        43.43%     613.475us      34.082us      39.390us        32.63%      41.213us       2.290us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.91%      28.864us       2.405us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.296us        15.98%      19.296us       1.608us            12  
+                                            aten::clone         1.39%      19.620us        37.04%     523.281us      87.213us       0.000us         0.00%      12.349us       2.058us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.526us         8.72%      10.526us       1.754us             6  
+                                              aten::add         2.28%      32.232us         3.86%      54.523us       9.087us       9.696us         8.03%       9.696us       1.616us             6  
+                                              aten::sub         2.48%      35.082us         4.10%      57.982us       9.664us       9.600us         7.95%       9.600us       1.600us             6  
+                                Activity Buffer Request        14.96%     211.375us        14.96%     211.375us     211.375us       1.823us         1.51%       1.823us       1.823us             1  
+                                    aten::empty_strided         2.07%      29.290us         2.07%      29.290us       4.882us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.27%     229.815us        16.27%     229.815us      38.302us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.68%      66.168us         5.95%      84.051us       3.502us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.27%      17.883us         1.27%      17.883us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.78%     222.895us        15.78%     222.895us       4.644us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       4.970us         0.35%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.422ms
-Self CUDA time total: 121.153us
+Self CPU time total: 1.413ms
+Self CUDA time total: 120.734us
 
 
 
@@ -4269,27 +4269,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     918.443us       533.10%     918.443us     918.443us             1  
-                                            torch_eager        20.03%     279.953us        99.65%       1.393ms       1.393ms       0.000us         0.00%     175.133us     175.133us             1  
-                                              aten::mul        10.59%     147.997us        18.47%     258.229us      10.760us      89.472us        51.93%      89.472us       3.728us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.472us        51.93%      89.472us       3.728us            24  
-                                            aten::copy_         7.43%     103.844us        43.15%     603.182us      33.510us      57.887us        33.60%      60.735us       3.374us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.831us        23.70%      40.831us       3.403us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.926us        14.47%      24.926us       2.077us            12  
-                                            aten::clone         1.45%      20.289us        37.34%     521.998us      87.000us       0.000us         0.00%      19.904us       3.317us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us         9.90%      17.056us       2.843us             6  
-                                              aten::add         2.21%      30.953us         3.79%      53.002us       8.834us      12.480us         7.24%      12.480us       2.080us             6  
-                                              aten::sub         2.40%      33.491us         4.09%      57.142us       9.524us      12.446us         7.22%      12.446us       2.074us             6  
-                                Activity Buffer Request        14.98%     209.468us        14.98%     209.468us     209.468us       2.848us         1.65%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.03%      28.380us         2.03%      28.380us       4.730us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.36%     228.728us        16.36%     228.728us      38.121us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.25%      73.370us         6.64%      92.881us       3.870us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.40%      19.511us         1.40%      19.511us       0.813us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.53%     217.074us        15.53%     217.074us       4.522us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       4.950us         0.35%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     941.367us       547.21%     941.367us     941.367us             1  
+                                            torch_eager        19.36%     280.543us        99.66%       1.444ms       1.444ms       0.000us         0.00%     174.877us     174.877us             1  
+                                              aten::mul        10.67%     154.592us        18.48%     267.677us      11.153us      89.535us        52.05%      89.535us       3.731us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.535us        52.05%      89.535us       3.731us            24  
+                                            aten::copy_         7.38%     106.934us        44.27%     641.329us      35.629us      57.694us        33.54%      60.542us       3.363us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.701us        23.66%      40.701us       3.392us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.800us        14.42%      24.800us       2.067us            12  
+                                            aten::clone         1.44%      20.830us        37.97%     550.103us      91.684us       0.000us         0.00%      19.841us       3.307us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.993us         9.88%      16.993us       2.832us             6  
+                                              aten::add         2.36%      34.121us         3.90%      56.522us       9.420us      12.448us         7.24%      12.448us       2.075us             6  
+                                              aten::sub         2.56%      37.161us         4.27%      61.881us      10.313us      12.352us         7.18%      12.352us       2.059us             6  
+                                Activity Buffer Request        16.20%     234.686us        16.20%     234.686us     234.686us       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.02%      29.270us         2.02%      29.270us       4.878us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.95%     231.027us        15.95%     231.027us      38.505us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.63%      67.091us         5.92%      85.764us       3.573us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      18.673us         1.29%      18.673us       0.778us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.80%     228.888us        15.80%     228.888us       4.768us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.980us         0.34%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.398ms
-Self CUDA time total: 172.285us
+Self CPU time total: 1.449ms
+Self CUDA time total: 172.029us
 
 
 
@@ -4299,27 +4299,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.822us       332.63%     945.822us     945.822us             1  
-                                            torch_eager        11.69%     314.391us        99.81%       2.685ms       2.685ms       0.000us         0.00%     302.941us     302.941us             1  
-                                              aten::mul         5.41%     145.454us         9.45%     254.127us      10.589us     133.310us        46.88%     133.310us       5.555us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.310us        46.88%     133.310us       5.555us            24  
-                                            aten::copy_         4.13%     111.027us        68.93%       1.854ms     103.002us     109.662us        38.57%     128.254us       7.125us            18  
-                                            aten::clone         1.07%      28.661us        65.93%       1.773ms     295.570us       0.000us         0.00%      70.912us      11.819us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.342us        20.17%      57.342us       4.779us            12  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.320us        18.40%      52.320us       8.720us             6  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.377us        14.55%      41.377us       3.448us            12  
-                                              aten::sub         1.27%      34.091us         2.15%      57.911us       9.652us      20.704us         7.28%      20.704us       3.451us             6  
-                                              aten::add         1.22%      32.950us         2.07%      55.610us       9.268us      20.673us         7.27%      20.673us       3.446us             6  
-                                Activity Buffer Request        54.12%       1.456ms        54.12%       1.456ms       1.456ms      18.592us         6.54%      18.592us      18.592us             1  
-                                    aten::empty_strided         1.18%      31.741us         1.18%      31.741us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.32%     223.797us         8.32%     223.797us      37.300us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.55%      68.485us         3.28%      88.267us       3.678us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.74%      19.782us         0.74%      19.782us       0.824us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.13%     218.664us         8.13%     218.664us       4.555us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.100us         0.19%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     950.141us       334.64%     950.141us     950.141us             1  
+                                            torch_eager        11.47%     310.562us        99.82%       2.702ms       2.702ms       0.000us         0.00%     302.012us     302.012us             1  
+                                              aten::mul         5.57%     150.802us         9.64%     260.955us      10.873us     133.822us        47.13%     133.822us       5.576us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.822us        47.13%     133.822us       5.576us            24  
+                                            aten::copy_         3.88%     105.155us        69.00%       1.868ms     103.782us     109.151us        38.44%     127.231us       7.068us            18  
+                                            aten::clone         0.99%      26.749us        66.03%       1.788ms     297.926us       0.000us         0.00%      69.886us      11.648us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.345us        20.20%      57.345us       4.779us            12  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.806us        18.25%      51.806us       8.634us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.959us        14.43%      40.959us       3.413us            12  
+                                              aten::sub         1.29%      34.831us         2.15%      58.172us       9.695us      20.607us         7.26%      20.607us       3.435us             6  
+                                              aten::add         1.26%      34.242us         2.11%      57.104us       9.517us      20.352us         7.17%      20.352us       3.392us             6  
+                                Activity Buffer Request        54.34%       1.471ms        54.34%       1.471ms       1.471ms      18.080us         6.37%      18.080us      18.080us             1  
+                                    aten::empty_strided         1.13%      30.492us         1.13%      30.492us       5.082us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.33%     225.535us         8.33%     225.535us      37.589us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.63%      71.143us         3.33%      90.164us       3.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      19.021us         0.70%      19.021us       0.793us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.22%     222.598us         8.22%     222.598us       4.637us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       4.920us         0.18%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.690ms
-Self CUDA time total: 284.349us
+Self CPU time total: 2.707ms
+Self CUDA time total: 283.932us
 
 
 
@@ -4329,27 +4329,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.033us       165.64%     938.033us     938.033us             1  
-                                            torch_eager        20.89%     291.484us        99.63%       1.390ms       1.390ms       0.000us         0.00%     590.004us     590.004us             1  
-                                            aten::copy_         7.34%     102.395us        41.53%     579.320us      32.184us     273.370us        48.27%     297.081us      16.504us            18  
-                                              aten::mul        10.73%     149.623us        18.75%     261.638us      10.902us     225.916us        39.89%     225.916us       9.413us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     225.916us        39.89%     225.916us       9.413us            24  
-                                            aten::clone         1.46%      20.369us        35.71%     498.147us      83.025us       0.000us         0.00%     206.459us      34.410us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.748us        32.27%     182.748us      30.458us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.622us        16.00%      90.622us       7.552us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      67.007us        11.83%      67.007us       5.584us            12  
-                                              aten::sub         2.52%      35.222us         4.78%      66.682us      11.114us      34.272us         6.05%      34.272us       5.712us             6  
-                                              aten::add         2.30%      32.121us         4.02%      56.063us       9.344us      32.735us         5.78%      32.735us       5.456us             6  
-                                Activity Buffer Request        14.16%     197.506us        14.16%     197.506us     197.506us      23.711us         4.19%      23.711us      23.711us             1  
-                                    aten::empty_strided         2.10%      29.332us         2.10%      29.332us       4.889us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.61%     217.828us        15.61%     217.828us      36.305us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.72%      65.792us         6.10%      85.041us       3.543us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.38%      19.249us         1.38%      19.249us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.42%     229.008us        16.42%     229.008us       4.771us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.150us         0.37%       5.150us       5.150us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     966.098us       169.87%     966.098us     966.098us             1  
+                                            torch_eager        20.40%     290.715us        99.64%       1.420ms       1.420ms       0.000us         0.00%     592.377us     592.377us             1  
+                                            aten::copy_         7.41%     105.615us        41.73%     594.574us      33.032us     275.293us        48.40%     298.941us      16.608us            18  
+                                              aten::mul        10.90%     155.244us        18.92%     269.648us      11.235us     227.071us        39.93%     227.071us       9.461us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     227.071us        39.93%     227.071us       9.461us            24  
+                                            aten::clone         1.44%      20.483us        35.30%     502.923us      83.821us       0.000us         0.00%     207.134us      34.522us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.486us        32.26%     183.486us      30.581us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.807us        16.14%      91.807us       7.651us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.365us        11.67%      66.365us       5.530us            12  
+                                              aten::sub         2.66%      37.929us         4.43%      63.131us      10.522us      33.790us         5.94%      33.790us       5.632us             6  
+                                              aten::add         2.47%      35.251us         4.15%      59.172us       9.862us      32.575us         5.73%      32.575us       5.429us             6  
+                                Activity Buffer Request        13.81%     196.814us        13.81%     196.814us     196.814us      23.648us         4.16%      23.648us      23.648us             1  
+                                    aten::empty_strided         2.02%      28.790us         2.02%      28.790us       4.798us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.63%     222.685us        15.63%     222.685us      37.114us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.20%      74.092us         6.55%      93.282us       3.887us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.35%      19.190us         1.35%      19.190us       0.800us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.35%     232.987us        16.35%     232.987us       4.854us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.080us         0.36%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.395ms
-Self CUDA time total: 566.293us
+Self CPU time total: 1.425ms
+Self CUDA time total: 568.729us
 
 
 
@@ -4359,27 +4359,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     912.211us       984.01%     912.211us     912.211us             1  
-                                            torch_eager        20.74%     286.708us        99.62%       1.377ms       1.377ms       0.000us         0.00%      93.855us      93.855us             1  
-                                              aten::mul        10.48%     144.890us        18.31%     253.080us      10.545us      49.856us        53.78%      49.856us       2.077us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.856us        53.78%      49.856us       2.077us            24  
-                                            aten::copy_         7.33%     101.333us        42.51%     587.542us      32.641us      29.407us        31.72%      30.559us       1.698us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.623us        24.40%      22.623us       1.885us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.440us        14.50%      13.440us       1.120us            12  
-                                            aten::clone         1.54%      21.251us        36.76%     508.068us      84.678us       0.000us         0.00%       7.936us       1.323us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.32%       6.784us       1.131us             6  
-                                              aten::sub         2.53%      34.908us         4.26%      58.910us       9.818us       6.720us         7.25%       6.720us       1.120us             6  
-                                              aten::add         2.34%      32.341us         3.97%      54.832us       9.139us       6.720us         7.25%       6.720us       1.120us             6  
-                                Activity Buffer Request        14.89%     205.787us        14.89%     205.787us     205.787us       1.152us         1.24%       1.152us       1.152us             1  
-                                    aten::empty_strided         2.09%      28.901us         2.09%      28.901us       4.817us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.89%     219.618us        15.89%     219.618us      36.603us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.84%      66.885us         6.21%      85.845us       3.577us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.37%      18.960us         1.37%      18.960us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.59%     215.487us        15.59%     215.487us       4.489us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.210us         0.38%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     975.032us      1053.20%     975.032us     975.032us             1  
+                                            torch_eager        19.78%     289.798us        99.66%       1.460ms       1.460ms       0.000us         0.00%      93.698us      93.698us             1  
+                                              aten::mul        11.08%     162.260us        19.21%     281.475us      11.728us      49.665us        53.65%      49.665us       2.069us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.665us        53.65%      49.665us       2.069us            24  
+                                            aten::copy_         7.16%     104.830us        42.02%     615.673us      34.204us      29.441us        31.80%      30.561us       1.698us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.657us        24.47%      22.657us       1.888us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.472us        14.55%      13.472us       1.123us            12  
+                                            aten::clone         1.39%      20.311us        36.25%     531.032us      88.505us       0.000us         0.00%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.33%       6.784us       1.131us             6  
+                                              aten::add         2.30%      33.730us         3.98%      58.302us       9.717us       6.752us         7.29%       6.752us       1.125us             6  
+                                              aten::sub         2.57%      37.640us         4.45%      65.262us      10.877us       6.720us         7.26%       6.720us       1.120us             6  
+                                Activity Buffer Request        14.75%     216.135us        14.75%     216.135us     216.135us       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         2.59%      37.931us         2.59%      37.931us       6.322us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.29%     223.986us        15.29%     223.986us      37.331us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.89%      71.623us         6.23%      91.274us       3.803us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.34%      19.651us         1.34%      19.651us       0.819us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.53%     242.131us        16.53%     242.131us       5.044us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       5.040us         0.34%       5.040us       5.040us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.382ms
-Self CUDA time total: 92.703us
+Self CPU time total: 1.465ms
+Self CUDA time total: 92.578us
 
 
 
@@ -4389,27 +4389,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.901us       973.14%     938.901us     938.901us             1  
-                                            torch_eager        11.77%     313.313us        99.82%       2.656ms       2.656ms       0.000us         0.00%      97.825us      97.825us             1  
-                                              aten::mul         5.60%     148.957us         9.78%     260.340us      10.847us      51.266us        53.14%      51.266us       2.136us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.266us        53.14%      51.266us       2.136us            24  
-                                            aten::copy_         3.87%     103.023us        68.29%       1.817ms     100.957us      30.976us        32.11%      32.319us       1.795us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.072us        23.91%      23.072us       1.923us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.240us        14.76%      14.240us       1.187us            12  
-                                            aten::clone         1.07%      28.429us        65.69%       1.748ms     291.327us       0.000us         0.00%       9.247us       1.541us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.19%       7.904us       1.317us             6  
-                                              aten::add         1.24%      33.110us         2.10%      56.011us       9.335us       7.137us         7.40%       7.137us       1.189us             6  
-                                              aten::sub         1.37%      36.490us         2.25%      59.790us       9.965us       7.103us         7.36%       7.103us       1.184us             6  
-                                Activity Buffer Request        53.84%       1.433ms        53.84%       1.433ms       1.433ms       1.343us         1.39%       1.343us       1.343us             1  
-                                    aten::empty_strided         1.19%      31.751us         1.19%      31.751us       5.292us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.25%     219.470us         8.25%     219.470us      36.578us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      69.934us         3.35%      89.134us       3.714us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.72%      19.200us         0.72%      19.200us       0.800us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.25%     219.576us         8.25%     219.576us       4.574us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       4.910us         0.18%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     963.643us      1001.81%     963.643us     963.643us             1  
+                                            torch_eager        11.60%     311.071us        99.82%       2.676ms       2.676ms       0.000us         0.00%      97.534us      97.534us             1  
+                                              aten::mul         5.66%     151.593us        10.00%     268.127us      11.172us      51.103us        53.13%      51.103us       2.129us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.103us        53.13%      51.103us       2.129us            24  
+                                            aten::copy_         3.93%     105.441us        68.13%       1.826ms     101.459us      30.911us        32.14%      32.255us       1.792us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.007us        23.92%      23.007us       1.917us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        14.74%      14.176us       1.181us            12  
+                                            aten::clone         1.04%      27.830us        65.21%       1.748ms     291.325us       0.000us         0.00%       9.248us       1.541us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.22%       7.904us       1.317us             6  
+                                              aten::sub         1.38%      37.040us         2.30%      61.581us      10.264us       7.103us         7.38%       7.103us       1.184us             6  
+                                              aten::add         1.19%      32.000us         2.05%      54.860us       9.143us       7.073us         7.35%       7.073us       1.179us             6  
+                                Activity Buffer Request        53.57%       1.436ms        53.57%       1.436ms       1.436ms       1.344us         1.40%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.19%      31.921us         1.19%      31.921us       5.320us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.14%     218.236us         8.14%     218.236us      36.373us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.76%      74.059us         3.52%      94.290us       3.929us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.75%      20.231us         0.75%      20.231us       0.843us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.60%     230.408us         8.60%     230.408us       4.800us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       4.700us         0.18%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.661ms
-Self CUDA time total: 96.482us
+Self CPU time total: 2.681ms
+Self CUDA time total: 96.190us
 
 
 
@@ -4419,27 +4419,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     932.446us       897.69%     932.446us     932.446us             1  
-                                            torch_eager        11.60%     307.685us        99.81%       2.647ms       2.647ms       0.000us         0.00%     105.184us     105.184us             1  
-                                              aten::mul         5.51%     146.123us         9.64%     255.679us      10.653us      55.362us        53.30%      55.362us       2.307us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.362us        53.30%      55.362us       2.307us            24  
-                                            aten::copy_         3.78%     100.194us        68.64%       1.821ms     101.144us      32.478us        31.27%      33.790us       1.877us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        23.78%      24.703us       2.059us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.032us        15.43%      16.032us       1.336us            12  
-                                            aten::clone         1.02%      27.179us        65.92%       1.748ms     291.378us       0.000us         0.00%       9.087us       1.515us             6  
-                                              aten::add         1.19%      31.489us         2.03%      53.840us       8.973us       8.064us         7.76%       8.064us       1.344us             6  
-                                              aten::sub         1.35%      35.692us         2.26%      59.843us       9.974us       7.968us         7.67%       7.968us       1.328us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.49%       7.775us       1.296us             6  
-                                Activity Buffer Request        54.18%       1.437ms        54.18%       1.437ms       1.437ms       1.312us         1.26%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.21%      32.003us         1.21%      32.003us       5.334us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.25%     218.717us         8.25%     218.717us      36.453us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.67%      70.760us         3.41%      90.371us       3.765us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.74%      19.611us         0.74%      19.611us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.32%     220.800us         8.32%     220.800us       4.600us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.070us         0.19%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     984.120us       950.08%     984.120us     984.120us             1  
+                                            torch_eager        21.32%     307.609us        99.66%       1.438ms       1.438ms       0.000us         0.00%     104.863us     104.863us             1  
+                                              aten::mul        11.11%     160.241us        19.03%     274.535us      11.439us      55.232us        53.32%      55.232us       2.301us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.232us        53.32%      55.232us       2.301us            24  
+                                            aten::copy_         7.56%     109.063us        40.34%     581.983us      32.332us      32.383us        31.26%      33.663us       1.870us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.639us        23.79%      24.639us       2.053us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.968us        15.42%      15.968us       1.331us            12  
+                                            aten::clone         1.50%      21.672us        34.18%     493.044us      82.174us       0.000us         0.00%       9.024us       1.504us             6  
+                                              aten::add         2.60%      37.520us         4.33%      62.511us      10.418us       8.031us         7.75%       8.031us       1.339us             6  
+                                              aten::sub         2.72%      39.231us         4.56%      65.841us      10.973us       7.937us         7.66%       7.937us       1.323us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.48%       7.744us       1.291us             6  
+                                Activity Buffer Request        13.05%     188.244us        13.05%     188.244us     188.244us       1.280us         1.24%       1.280us       1.280us             1  
+                                    aten::empty_strided         2.28%      32.882us         2.28%      32.882us       5.480us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.94%     215.555us        14.94%     215.555us      35.926us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.93%      71.162us         6.28%      90.612us       3.776us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.35%      19.450us         1.35%      19.450us       0.810us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.29%     235.016us        16.29%     235.016us       4.896us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.880us         0.34%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.652ms
-Self CUDA time total: 103.872us
+Self CPU time total: 1.443ms
+Self CUDA time total: 103.583us
 
 
 
@@ -4449,27 +4449,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     914.130us       736.81%     914.130us     914.130us             1  
-                                            torch_eager        19.76%     284.015us        99.65%       1.432ms       1.432ms       0.000us         0.00%     125.858us     125.858us             1  
-                                              aten::mul        10.20%     146.586us        17.70%     254.419us      10.601us      65.313us        52.64%      65.313us       2.721us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.313us        52.64%      65.313us       2.721us            24  
-                                            aten::copy_         7.71%     110.793us        44.82%     644.172us      35.787us      39.489us        31.83%      41.281us       2.293us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.961us        23.34%      28.961us       2.413us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.53%      19.264us       1.605us            12  
-                                            aten::clone         1.45%      20.820us        39.14%     562.560us      93.760us       0.000us         0.00%      12.320us       2.053us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us         8.49%      10.528us       1.755us             6  
-                                              aten::add         2.34%      33.572us         3.91%      56.142us       9.357us       9.664us         7.79%       9.664us       1.611us             6  
-                                              aten::sub         2.40%      34.530us         4.02%      57.751us       9.625us       9.600us         7.74%       9.600us       1.600us             6  
-                                Activity Buffer Request        17.82%     256.078us        17.82%     256.078us     256.078us       1.792us         1.44%       1.792us       1.792us             1  
-                                    aten::empty_strided         2.04%      29.262us         2.04%      29.262us       4.877us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.99%     215.437us        14.99%     215.437us      35.906us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.63%      66.508us         5.96%      85.660us       3.569us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.33%      19.152us         1.33%      19.152us       0.798us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        14.99%     215.488us        14.99%     215.488us       4.489us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       5.000us         0.35%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     935.122us       757.84%     935.122us     935.122us             1  
+                                            torch_eager        19.99%     283.519us        99.60%       1.412ms       1.412ms       0.000us         0.00%     125.153us     125.153us             1  
+                                              aten::mul        10.97%     155.634us        18.77%     266.135us      11.089us      65.024us        52.70%      65.024us       2.709us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.024us        52.70%      65.024us       2.709us            24  
+                                            aten::copy_         7.53%     106.809us        43.10%     611.203us      33.956us      39.201us        31.77%      40.961us       2.276us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.769us        23.31%      28.769us       2.397us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us        15.53%      19.168us       1.597us            12  
+                                            aten::clone         1.50%      21.262us        37.00%     524.722us      87.454us       0.000us         0.00%      12.192us       2.032us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.45%      10.432us       1.739us             6  
+                                              aten::add         2.41%      34.151us         3.94%      55.922us       9.320us       9.664us         7.83%       9.664us       1.611us             6  
+                                              aten::sub         2.49%      35.371us         4.21%      59.711us       9.952us       9.504us         7.70%       9.504us       1.584us             6  
+                                Activity Buffer Request        14.55%     206.375us        14.55%     206.375us     206.375us       1.760us         1.43%       1.760us       1.760us             1  
+                                    aten::empty_strided         2.12%      30.049us         2.12%      30.049us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.20%     229.735us        16.20%     229.735us      38.289us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.63%      65.693us         5.97%      84.623us       3.526us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.33%      18.930us         1.33%      18.930us       0.789us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.86%     224.896us        15.86%     224.896us       4.685us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.729us         0.40%       5.729us       5.729us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.437ms
-Self CUDA time total: 124.066us
+Self CPU time total: 1.418ms
+Self CUDA time total: 123.393us
 
 
 
@@ -4479,27 +4479,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     921.138us       886.26%     921.138us     921.138us             1  
-                                            torch_eager        20.59%     281.307us        99.64%       1.361ms       1.361ms       0.000us         0.00%     105.280us     105.280us             1  
-                                              aten::mul        10.84%     148.087us        18.91%     258.361us      10.765us      55.487us        53.39%      55.487us       2.312us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.487us        53.39%      55.487us       2.312us            24  
-                                            aten::copy_         7.39%     100.946us        41.35%     564.842us      31.380us      32.481us        31.25%      33.825us       1.879us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.71%      24.640us       2.053us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.968us        15.36%      15.968us       1.331us            12  
-                                            aten::clone         1.54%      21.041us        35.66%     487.118us      81.186us       0.000us         0.00%       9.185us       1.531us             6  
-                                              aten::sub         2.75%      37.531us         4.47%      61.012us      10.169us       8.031us         7.73%       8.031us       1.339us             6  
-                                              aten::add         2.35%      32.112us         3.97%      54.222us       9.037us       7.937us         7.64%       7.937us       1.323us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us         7.54%       7.841us       1.307us             6  
-                                Activity Buffer Request        13.62%     186.046us        13.62%     186.046us     186.046us       1.344us         1.29%       1.344us       1.344us             1  
-                                    aten::empty_strided         2.20%      30.110us         2.20%      30.110us       5.018us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.76%     215.337us        15.76%     215.337us      35.890us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.18%      70.704us         6.60%      90.193us       3.758us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.43%      19.489us         1.43%      19.489us       0.812us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.99%     218.378us        15.99%     218.378us       4.550us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       4.960us         0.36%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.101us       931.86%     964.101us     964.101us             1  
+                                            torch_eager        11.58%     311.269us        99.80%       2.682ms       2.682ms       0.000us         0.00%     104.772us     104.772us             1  
+                                              aten::mul         5.74%     154.165us         9.94%     267.067us      11.128us      55.236us        53.39%      55.236us       2.301us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.236us        53.39%      55.236us       2.301us            24  
+                                            aten::copy_         4.07%     109.351us        68.30%       1.836ms     101.989us      32.287us        31.21%      33.599us       1.867us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.511us        23.69%      24.511us       2.043us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.937us        15.40%      15.937us       1.328us            12  
+                                            aten::clone         1.02%      27.532us        65.06%       1.749ms     291.482us       0.000us         0.00%       9.088us       1.515us             6  
+                                              aten::add         1.31%      35.310us         2.20%      59.141us       9.857us       7.969us         7.70%       7.969us       1.328us             6  
+                                              aten::sub         1.38%      37.131us         2.33%      62.602us      10.434us       7.968us         7.70%       7.968us       1.328us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         7.52%       7.776us       1.296us             6  
+                                Activity Buffer Request        53.54%       1.439ms        53.54%       1.439ms       1.439ms       1.312us         1.27%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.12%      30.190us         1.12%      30.190us       5.032us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.09%     217.335us         8.09%     217.335us      36.223us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.62%      70.291us         3.31%      88.901us       3.704us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.69%      18.610us         0.69%      18.610us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.64%     232.137us         8.64%     232.137us       4.836us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.481us         0.20%       5.481us       5.481us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.366ms
-Self CUDA time total: 103.936us
+Self CPU time total: 2.688ms
+Self CUDA time total: 103.460us
 
 
 
@@ -4509,27 +4509,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.466us       759.69%     943.466us     943.466us             1  
-                                            torch_eager        21.73%     302.071us        99.63%       1.385ms       1.385ms       0.000us         0.00%     125.950us     125.950us             1  
-                                              aten::mul        10.55%     146.657us        18.63%     259.039us      10.793us      65.378us        52.64%      65.378us       2.724us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.378us        52.64%      65.378us       2.724us            24  
-                                            aten::copy_         7.63%     106.103us        41.12%     571.631us      31.757us      39.519us        31.82%      41.278us       2.293us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      29.024us        23.37%      29.024us       2.419us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.294us        15.54%      19.294us       1.608us            12  
-                                            aten::clone         1.52%      21.080us        35.11%     488.057us      81.343us       0.000us         0.00%      12.254us       2.042us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.495us         8.45%      10.495us       1.749us             6  
-                                              aten::sub         2.46%      34.153us         4.15%      57.634us       9.606us       9.727us         7.83%       9.727us       1.621us             6  
-                                              aten::add         2.41%      33.450us         4.05%      56.342us       9.390us       9.567us         7.70%       9.567us       1.595us             6  
-                                Activity Buffer Request        13.70%     190.466us        13.70%     190.466us     190.466us       1.759us         1.42%       1.759us       1.759us             1  
-                                    aten::empty_strided         2.14%      29.791us         2.14%      29.791us       4.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.29%     212.610us        15.29%     212.610us      35.435us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.88%      67.802us         6.29%      87.511us       3.646us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.42%      19.709us         1.42%      19.709us       0.821us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.91%     221.207us        15.91%     221.207us       4.608us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.080us         0.37%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.072us       780.68%     964.072us     964.072us             1  
+                                            torch_eager        11.45%     316.268us        99.81%       2.758ms       2.758ms       0.000us         0.00%     125.283us     125.283us             1  
+                                              aten::mul         5.46%     150.776us         9.46%     261.336us      10.889us      65.090us        52.71%      65.090us       2.712us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.090us        52.71%      65.090us       2.712us            24  
+                                            aten::copy_         3.85%     106.511us        68.83%       1.902ms     105.647us      39.266us        31.80%      41.058us       2.281us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.802us        23.32%      28.802us       2.400us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.135us        15.50%      19.135us       1.595us            12  
+                                            aten::clone         1.09%      30.231us        66.11%       1.827ms     304.441us       0.000us         0.00%      12.256us       2.043us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us         8.47%      10.464us       1.744us             6  
+                                              aten::add         1.22%      33.650us         2.08%      57.431us       9.572us       9.599us         7.77%       9.599us       1.600us             6  
+                                              aten::sub         1.35%      37.292us         2.48%      68.652us      11.442us       9.536us         7.72%       9.536us       1.589us             6  
+                                Activity Buffer Request        54.53%       1.507ms        54.53%       1.507ms       1.507ms       1.792us         1.45%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.19%      32.821us         1.19%      32.821us       5.470us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.01%     221.424us         8.01%     221.424us      36.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.55%      70.592us         3.23%      89.363us       3.723us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.68%      18.771us         0.68%      18.771us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.42%     232.664us         8.42%     232.664us       4.847us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.190us         0.19%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.390ms
-Self CUDA time total: 124.191us
+Self CPU time total: 2.763ms
+Self CUDA time total: 123.491us
 
 
 
@@ -4539,27 +4539,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.497us       512.75%     909.497us     909.497us             1  
-                                            torch_eager        20.85%     278.298us        99.63%       1.330ms       1.330ms       0.000us         0.00%     180.288us     180.288us             1  
-                                              aten::mul        10.86%     144.977us        19.10%     254.920us      10.622us      94.591us        53.33%      94.591us       3.941us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.591us        53.33%      94.591us       3.941us            24  
-                                            aten::copy_         7.76%     103.603us        40.90%     545.870us      30.326us      57.919us        32.65%      60.831us       3.380us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.767us        22.98%      40.767us       3.397us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.866us        14.02%      24.866us       2.072us            12  
-                                            aten::clone         1.59%      21.200us        34.96%     466.526us      77.754us       0.000us         0.00%      20.064us       3.344us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.152us         9.67%      17.152us       2.859us             6  
-                                              aten::sub         2.64%      35.242us         4.38%      58.452us       9.742us      12.450us         7.02%      12.450us       2.075us             6  
-                                              aten::add         2.38%      31.821us         4.13%      55.081us       9.180us      12.416us         7.00%      12.416us       2.069us             6  
-                                Activity Buffer Request        12.93%     172.606us        12.93%     172.606us     172.606us       2.912us         1.64%       2.912us       2.912us             1  
-                                    aten::empty_strided         2.27%      30.341us         2.27%      30.341us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.64%     208.798us        15.64%     208.798us      34.800us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.99%      66.616us         6.40%      85.475us       3.561us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.41%      18.859us         1.41%      18.859us       0.786us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.28%     217.276us        16.28%     217.276us       4.527us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.37%       5.001us         0.37%       5.001us       5.001us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     934.855us       527.63%     934.855us     934.855us             1  
+                                            torch_eager        19.51%     283.728us        99.66%       1.450ms       1.450ms       0.000us         0.00%     180.061us     180.061us             1  
+                                              aten::mul        10.43%     151.748us        18.10%     263.338us      10.972us      95.007us        53.62%      95.007us       3.959us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.007us        53.62%      95.007us       3.959us            24  
+                                            aten::copy_         7.11%     103.461us        44.35%     645.065us      35.837us      57.664us        32.55%      60.544us       3.364us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.608us        22.92%      40.608us       3.384us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.510us        13.83%      24.510us       2.042us            12  
+                                            aten::clone         1.46%      21.280us        38.39%     558.424us      93.071us       0.000us         0.00%      19.936us       3.323us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us         9.63%      17.056us       2.843us             6  
+                                              aten::add         2.36%      34.271us         3.99%      58.001us       9.667us      12.287us         6.93%      12.287us       2.048us             6  
+                                              aten::sub         2.55%      37.161us         4.24%      61.641us      10.274us      12.223us         6.90%      12.223us       2.037us             6  
+                                Activity Buffer Request        17.53%     255.006us        17.53%     255.006us     255.006us       2.880us         1.63%       2.880us       2.880us             1  
+                                    aten::empty_strided         2.02%      29.311us         2.02%      29.311us       4.885us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.21%     221.267us        15.21%     221.267us      36.878us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.73%      68.750us         6.01%      87.372us       3.641us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.28%      18.622us         1.28%      18.622us       0.776us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.48%     225.131us        15.48%     225.131us       4.690us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.880us         0.34%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.335ms
-Self CUDA time total: 177.376us
+Self CPU time total: 1.455ms
+Self CUDA time total: 177.181us
 
 
 
@@ -4569,27 +4569,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     908.914us       305.78%     908.914us     908.914us             1  
-                                            torch_eager        20.55%     283.527us        99.64%       1.375ms       1.375ms       0.000us         0.00%     314.296us     314.296us             1  
-                                              aten::mul        10.61%     146.340us        18.54%     255.803us      10.658us     145.086us        48.81%     145.086us       6.045us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.086us        48.81%     145.086us       6.045us            24  
-                                            aten::copy_         7.34%     101.324us        42.67%     588.790us      32.711us     111.099us        37.38%     128.154us       7.120us            18  
-                                            aten::clone         1.50%      20.722us        37.09%     511.699us      85.283us       0.000us         0.00%      70.718us      11.786us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.436us        19.32%      57.436us       4.786us            12  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.663us        18.05%      53.663us       8.944us             6  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.056us        13.81%      41.056us       3.421us            12  
-                                              aten::sub         2.49%      34.330us         4.16%      57.351us       9.558us      20.672us         6.95%      20.672us       3.445us             6  
-                                              aten::add         2.29%      31.611us         3.89%      53.723us       8.954us      20.384us         6.86%      20.384us       3.397us             6  
-                                Activity Buffer Request        15.84%     218.487us        15.84%     218.487us     218.487us      17.055us         5.74%      17.055us      17.055us             1  
-                                    aten::empty_strided         2.18%      30.110us         2.18%      30.110us       5.018us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.10%     208.357us        15.10%     208.357us      34.726us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.74%      65.442us         6.15%      84.803us       3.533us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.40%      19.361us         1.40%      19.361us       0.807us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.60%     215.218us        15.60%     215.218us       4.484us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       4.930us         0.36%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     936.902us       314.34%     936.902us     936.902us             1  
+                                            torch_eager        19.95%     279.505us        99.63%       1.396ms       1.396ms       0.000us         0.00%     315.267us     315.267us             1  
+                                              aten::mul        10.85%     152.079us        18.94%     265.395us      11.058us     146.176us        49.04%     146.176us       6.091us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.176us        49.04%     146.176us       6.091us            24  
+                                            aten::copy_         7.66%     107.385us        42.60%     596.937us      33.163us     110.978us        37.23%     128.194us       7.122us            18  
+                                            aten::clone         1.45%      20.319us        36.31%     508.783us      84.797us       0.000us         0.00%      70.625us      11.771us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.569us        19.32%      57.569us       4.797us            12  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.409us        17.92%      53.409us       8.902us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.897us        13.72%      40.897us       3.408us            12  
+                                              aten::sub         2.61%      36.531us         4.38%      61.402us      10.234us      20.449us         6.86%      20.449us       3.408us             6  
+                                              aten::add         2.39%      33.533us         3.98%      55.753us       9.292us      20.448us         6.86%      20.448us       3.408us             6  
+                                Activity Buffer Request        14.75%     206.705us        14.75%     206.705us     206.705us      17.216us         5.78%      17.216us      17.216us             1  
+                                    aten::empty_strided         2.13%      29.842us         2.13%      29.842us       4.974us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.44%     216.385us        15.44%     216.385us      36.064us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.91%      68.874us         6.21%      87.042us       3.627us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.30%      18.168us         1.30%      18.168us       0.757us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.19%     226.869us        16.19%     226.869us       4.726us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.161us         0.37%       5.161us       5.161us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.380ms
-Self CUDA time total: 297.241us
+Self CPU time total: 1.401ms
+Self CUDA time total: 298.051us
 
 
 
@@ -4599,27 +4599,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.162us       529.48%     939.162us     939.162us             1  
-                                            torch_eager        11.57%     307.472us        99.80%       2.653ms       2.653ms       0.000us         0.00%     180.256us     180.256us             1  
-                                              aten::mul         5.55%     147.649us         9.66%     256.649us      10.694us      94.851us        53.47%      94.851us       3.952us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.851us        53.47%      94.851us       3.952us            24  
-                                            aten::copy_         3.85%     102.292us        68.52%       1.821ms     101.186us      57.759us        32.56%      60.639us       3.369us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.671us        22.93%      40.671us       3.389us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.766us        13.96%      24.766us       2.064us            12  
-                                            aten::clone         1.06%      28.080us        65.81%       1.749ms     291.547us       0.000us         0.00%      19.968us       3.328us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us         9.63%      17.088us       2.848us             6  
-                                              aten::add         1.13%      30.133us         1.96%      52.053us       8.675us      12.384us         6.98%      12.384us       2.064us             6  
-                                              aten::sub         1.27%      33.752us         2.15%      57.162us       9.527us      12.382us         6.98%      12.382us       2.064us             6  
-                                Activity Buffer Request        54.50%       1.449ms        54.50%       1.449ms       1.449ms       2.880us         1.62%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.13%      30.142us         1.13%      30.142us       5.024us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.84%     208.428us         7.84%     208.428us      34.738us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.02%      80.309us         3.76%      99.911us       4.163us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.74%      19.602us         0.74%      19.602us       0.817us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.14%     216.293us         8.14%     216.293us       4.506us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.200us         0.20%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     953.069us       538.57%     953.069us     953.069us             1  
+                                            torch_eager        19.36%     280.983us        99.62%       1.446ms       1.446ms       0.000us         0.00%     179.812us     179.812us             1  
+                                              aten::mul        10.74%     155.876us        18.65%     270.688us      11.279us      94.916us        53.64%      94.916us       3.955us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.916us        53.64%      94.916us       3.955us            24  
+                                            aten::copy_         7.70%     111.823us        43.62%     633.117us      35.173us      57.568us        32.53%      60.416us       3.356us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.544us        22.91%      40.544us       3.379us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.480us        13.83%      24.480us       2.040us            12  
+                                            aten::clone         1.50%      21.731us        37.58%     545.384us      90.897us       0.000us         0.00%      19.872us       3.312us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.62%      17.024us       2.837us             6  
+                                              aten::add         2.38%      34.509us         4.05%      58.781us       9.797us      12.256us         6.93%      12.256us       2.043us             6  
+                                              aten::sub         2.51%      36.442us         4.13%      59.923us       9.987us      12.224us         6.91%      12.224us       2.037us             6  
+                                Activity Buffer Request        15.40%     223.485us        15.40%     223.485us     223.485us       2.848us         1.61%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.13%      30.930us         2.13%      30.930us       5.155us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.79%     229.197us        15.79%     229.197us      38.200us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.88%      70.882us         6.18%      89.652us       3.735us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      18.770us         1.29%      18.770us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.93%     231.177us        15.93%     231.177us       4.816us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.510us         0.38%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.658ms
-Self CUDA time total: 177.376us
+Self CPU time total: 1.451ms
+Self CUDA time total: 176.964us
 
 
 
@@ -4629,27 +4629,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     942.515us       317.36%     942.515us     942.515us             1  
-                                            torch_eager        20.57%     285.923us        99.62%       1.385ms       1.385ms       0.000us         0.00%     314.717us     314.717us             1  
-                                              aten::mul        10.73%     149.116us        18.62%     258.870us      10.786us     145.439us        48.97%     145.439us       6.060us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.439us        48.97%     145.439us       6.060us            24  
-                                            aten::copy_         7.46%     103.659us        42.33%     588.488us      32.694us     110.749us        37.29%     128.477us       7.138us            18  
-                                            aten::clone         1.56%      21.753us        36.61%     508.959us      84.826us       0.000us         0.00%      71.104us      11.851us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.373us        19.32%      57.373us       4.781us            12  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.376us        17.97%      53.376us       8.896us             6  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.801us        13.74%      40.801us       3.400us            12  
-                                              aten::sub         2.38%      33.081us         4.03%      56.021us       9.337us      20.449us         6.89%      20.449us       3.408us             6  
-                                              aten::add         2.40%      33.331us         4.05%      56.271us       9.379us      20.352us         6.85%      20.352us       3.392us             6  
-                                Activity Buffer Request        14.18%     197.118us        14.18%     197.118us     197.118us      17.728us         5.97%      17.728us      17.728us             1  
-                                    aten::empty_strided         2.21%      30.780us         2.21%      30.780us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.19%     225.018us        16.19%     225.018us      37.503us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.87%      67.722us         6.24%      86.713us       3.613us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.37%      18.991us         1.37%      18.991us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.71%     218.327us        15.71%     218.327us       4.548us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.310us         0.38%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     992.756us       332.77%     992.756us     992.756us             1  
+                                            torch_eager        20.12%     289.006us        99.66%       1.432ms       1.432ms       0.000us         0.00%     316.222us     316.222us             1  
+                                              aten::mul        11.31%     162.528us        19.47%     279.759us      11.657us     146.880us        49.23%     146.880us       6.120us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.880us        49.23%     146.880us       6.120us            24  
+                                            aten::copy_         7.73%     111.012us        41.48%     595.895us      33.105us     110.942us        37.19%     128.830us       7.157us            18  
+                                            aten::clone         1.55%      22.310us        35.21%     505.793us      84.299us       0.000us         0.00%      71.424us      11.904us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.406us        19.24%      57.406us       4.784us            12  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.536us        17.94%      53.536us       8.923us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.512us        13.58%      40.512us       3.376us            12  
+                                              aten::add         2.53%      36.289us         4.25%      61.011us      10.169us      20.352us         6.82%      20.352us       3.392us             6  
+                                              aten::sub         2.59%      37.162us         4.41%      63.291us      10.549us      20.160us         6.76%      20.160us       3.360us             6  
+                                Activity Buffer Request        13.10%     188.164us        13.10%     188.164us     188.164us      17.888us         6.00%      17.888us      17.888us             1  
+                                    aten::empty_strided         2.24%      32.121us         2.24%      32.121us       5.354us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.74%     226.067us        15.74%     226.067us      37.678us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.81%      69.111us         6.15%      88.363us       3.682us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.34%      19.252us         1.34%      19.252us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.62%     238.734us        16.62%     238.734us       4.974us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.940us         0.34%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.390ms
-Self CUDA time total: 296.989us
+Self CPU time total: 1.437ms
+Self CUDA time total: 298.334us
 
 
 
@@ -4659,27 +4659,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     928.214us       158.30%     928.214us     928.214us             1  
-                                            torch_eager        21.21%     285.194us        99.61%       1.340ms       1.340ms       0.000us         0.00%     610.012us     610.012us             1  
-                                            aten::copy_         7.59%     102.047us        40.19%     540.521us      30.029us     268.445us        45.78%     292.093us      16.227us            18  
-                                              aten::mul        11.07%     148.860us        19.42%     261.184us      10.883us     251.679us        42.92%     251.679us      10.487us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.679us        42.92%     251.679us      10.487us            24  
-                                            aten::clone         1.57%      21.069us        34.26%     460.696us      76.783us       0.000us         0.00%     201.406us      33.568us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.758us        30.32%     177.758us      29.626us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.687us        15.47%      90.687us       7.557us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.240us        11.30%      66.240us       5.520us            12  
-                                              aten::sub         2.72%      36.642us         4.50%      60.582us      10.097us      33.152us         5.65%      33.152us       5.525us             6  
-                                              aten::add         2.29%      30.800us         3.93%      52.901us       8.817us      33.088us         5.64%      33.088us       5.515us             6  
-                                Activity Buffer Request        12.31%     165.596us        12.31%     165.596us     165.596us      23.648us         4.03%      23.648us      23.648us             1  
-                                    aten::empty_strided         2.19%      29.501us         2.19%      29.501us       4.917us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.63%     210.266us        15.63%     210.266us      35.044us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.16%      69.374us         6.60%      88.734us       3.697us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.44%      19.360us         1.44%      19.360us       0.807us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.43%     220.977us        16.43%     220.977us       4.604us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.39%       5.180us         0.39%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     957.657us       163.29%     957.657us     957.657us             1  
+                                            torch_eager        20.09%     288.813us        99.63%       1.432ms       1.432ms       0.000us         0.00%     610.425us     610.425us             1  
+                                            aten::copy_         7.31%     105.011us        42.63%     612.724us      34.040us     268.572us        45.79%     292.508us      16.250us            18  
+                                              aten::mul        10.71%     153.870us        18.84%     270.776us      11.282us     252.607us        43.07%     252.607us      10.525us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     252.607us        43.07%     252.607us      10.525us            24  
+                                            aten::clone         1.42%      20.480us        36.58%     525.692us      87.615us       0.000us         0.00%     201.566us      33.594us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.630us        30.29%     177.630us      29.605us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.942us        15.51%      90.942us       7.578us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.310us        11.14%      65.310us       5.443us            12  
+                                              aten::sub         2.69%      38.720us         4.45%      63.991us      10.665us      32.991us         5.63%      32.991us       5.499us             6  
+                                              aten::add         2.37%      34.041us         3.93%      56.461us       9.410us      32.319us         5.51%      32.319us       5.387us             6  
+                                Activity Buffer Request        15.99%     229.866us        15.99%     229.866us     229.866us      23.936us         4.08%      23.936us      23.936us             1  
+                                    aten::empty_strided         2.02%      29.010us         2.02%      29.010us       4.835us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.72%     211.585us        14.72%     211.585us      35.264us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.83%      69.478us         6.24%      89.671us       3.736us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.40%      20.193us         1.40%      20.193us       0.841us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.06%     230.859us        16.06%     230.859us       4.810us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.320us         0.37%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.345ms
-Self CUDA time total: 586.364us
+Self CPU time total: 1.437ms
+Self CUDA time total: 586.489us
 
 
 
@@ -4689,35 +4689,35 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         9.32%     323.657us        76.63%       2.662ms       2.662ms       0.000us         0.00%       1.834ms       1.834ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.806ms       102.11%       1.806ms       1.806ms             1  
-                                            aten::copy_         3.12%     108.276us        52.46%       1.822ms     101.225us     791.134us        44.74%     857.278us      47.627us            18  
-                                              aten::mul         4.16%     144.572us         7.37%     256.109us      10.671us     827.198us        46.78%     827.198us      34.467us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     827.198us        46.78%     827.198us      34.467us            24  
-                                            aten::clone         0.81%      28.142us        50.15%       1.742ms     290.300us       0.000us         0.00%     624.095us     104.016us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     557.951us        31.55%     557.951us      92.992us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     233.183us        13.19%     233.183us      19.432us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     149.919us         8.48%     149.919us      12.493us            12  
-                                              aten::sub         0.98%      34.102us         1.65%      57.362us       9.560us      90.368us         5.11%      90.368us      15.061us             6  
-                                Activity Buffer Request        41.53%       1.443ms        41.53%       1.443ms       1.443ms      66.144us         3.74%      66.144us      66.144us             1  
-                                              aten::add         0.89%      30.740us         1.53%      53.293us       8.882us      59.551us         3.37%      59.551us       9.925us             6  
-                                    aten::empty_strided         0.86%      29.871us         0.86%      29.871us       4.979us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         5.94%     206.426us         5.94%     206.426us      34.404us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.06%      71.442us         2.62%      91.034us       3.793us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.56%      19.592us         0.56%      19.592us       0.816us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         6.40%     222.192us         6.40%     222.192us       4.629us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        23.37%     811.698us        23.37%     811.698us     811.698us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         9.43%     329.378us        77.87%       2.720ms       2.720ms       0.000us         0.00%       1.842ms       1.842ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.815ms       102.19%       1.815ms       1.815ms             1  
+                                            aten::copy_         3.09%     107.951us        52.68%       1.840ms     102.235us     794.051us        44.71%     860.068us      47.782us            18  
+                                              aten::mul         4.59%     160.365us         8.02%     279.997us      11.667us     834.368us        46.99%     834.368us      34.765us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     834.368us        46.99%     834.368us      34.765us            24  
+                                            aten::clone         0.80%      28.034us        50.14%       1.751ms     291.882us       0.000us         0.00%     627.394us     104.566us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     561.377us        31.61%     561.377us      93.563us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.674us        13.10%     232.674us      19.389us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.392us         8.30%     147.392us      12.283us            12  
+                                              aten::sub         1.14%      39.970us         1.89%      66.170us      11.028us      89.952us         5.07%      89.952us      14.992us             6  
+                                Activity Buffer Request        41.31%       1.443ms        41.31%       1.443ms       1.443ms      66.017us         3.72%      66.017us      66.017us             1  
+                                              aten::add         0.95%      33.281us         1.61%      56.271us       9.379us      57.440us         3.23%      57.440us       9.573us             6  
+                                    aten::empty_strided         0.85%      29.670us         0.85%      29.670us       4.945us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.22%     217.146us         6.22%     217.146us      36.191us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.01%      70.292us         2.58%      90.182us       3.758us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.57%      19.890us         0.57%      19.890us       0.829us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         6.90%     240.975us         6.90%     240.975us       5.020us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        22.13%     773.090us        22.13%     773.090us     773.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.473ms
-Self CUDA time total: 1.768ms
+Self CPU time total: 3.493ms
+Self CUDA time total: 1.776ms
 
 
 impl                     wl                  p50(ms)  ok
 torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
 torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
 torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
 torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
 torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
@@ -4735,7 +4735,7 @@ torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
 torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
 torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
 </pre></div>
 <div class="cell-artifacts">
diff --git a/rotary/index.html b/rotary/index.html
index cb1be8e4d680b5623caf2d05c1be684b075964b4..5ff503336b04c290f15ed24958b96a45568efad3 100644
--- a/rotary/index.html
+++ b/rotary/index.html
@@ -1,89 +1,3879 @@
 <!DOCTYPE html>
-<html>
+<html lang="en">
 <head>
-  <meta charset='UTF-8'>
-  <meta name='viewport' content='width=device-width, initial-scale=1.0'>
-  <title>Index of /rotary</title>
-  <style>
-    :root {
-      --bg-primary: #0a0a0a;
-      --bg-secondary: #121212;
-      --bg-tertiary: #181818;
-      --text-primary: #e0e0e0;
-      --text-secondary: #888888;
-      --text-link: #64b5f6;
-      --border-primary: #2a2a2a;
-    }
-    body {
-      font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
-      background: var(--bg-primary);
-      color: var(--text-primary);
-      margin: 0;
-      padding: 16px;
-      max-width: 900px;
-      margin: 0 auto;
-    }
-    .controls {
-      display: flex;
-      justify-content: flex-end;
-      margin-bottom: 1rem;
-    }
-    .back-button {
-      background: var(--bg-secondary);
-      border: 1px solid var(--border-primary);
-      padding: 8px 12px;
-      border-radius: 4px;
-      color: var(--text-secondary);
-      cursor: pointer;
-      font-size: 0.9rem;
-      text-decoration: none;
-      display: inline-block;
-    }
-    .back-button:hover {
-      color: var(--text-primary);
-      background: var(--bg-tertiary);
-    }
-    h1 {
-      font-size: 1.5em;
-      margin: 1rem 0;
-      color: var(--text-primary);
-      border-bottom: 1px solid var(--border-primary);
-      padding-bottom: 0.5rem;
-    }
-    ul {
-      list-style-type: none;
-      padding: 0;
-    }
-    li {
-      margin: 0;
-      border-bottom: 1px solid var(--border-primary);
-    }
-    li:last-child {
-      border-bottom: none;
-    }
-    a {
-      display: block;
-      padding: 0.75rem 0.5rem;
-      text-decoration: none;
-      color: var(--text-link);
-      transition: background 0.2s ease;
-    }
-    a:hover {
-      background: var(--bg-secondary);
-    }
-    .dir {
-      font-weight: 500;
-    }
-  </style>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>index</title>
+    <script>
+// Iframe-friendly navigation router
+        (function() {
+            const isIframe = window.self !== window.top;
+            if (!isIframe) return; // Only activate in iframe context
+
+            // On load: if hash points to a different page, navigate there
+            const hash = window.location.hash;
+            if (hash && hash.startsWith('#/')) {
+                const targetPath = hash.slice(2); // Remove '#/'
+                const currentPath = window.location.pathname.split('/').pop();
+
+                // Only navigate if we're not already on the target page
+                if (targetPath !== currentPath) {
+                    window.location.href = targetPath;
+                    return; // Stop execution, we're navigating away
+                }
+            }
+
+            // Intercept all link clicks for hash-based navigation
+            document.addEventListener('click', function(e) {
+                const link = e.target.closest('a');
+                if (!link) return;
+
+                const href = link.getAttribute('href');
+
+                // Skip external links, anchors, and javascript: links
+                if (!href || href.startsWith('#') || href.startsWith('http') || href.startsWith('javascript:')) {
+                    return;
+                }
+
+                e.preventDefault();
+
+                // Convert relative/absolute path to hash-based navigation
+                const url = new URL(href, window.location.href);
+                let fullPath = url.pathname;
+
+                // Remove leading slash if present for cleaner paths
+                if (fullPath.startsWith('/')) {
+                    fullPath = fullPath.slice(1);
+                }
+
+                // Update parent URL hash
+                window.location.hash = '#/' + fullPath;
+
+                // For HTML files, navigate within iframe
+                if (fullPath.endsWith('.html') || fullPath.endsWith('/')) {
+                    const pathParts = fullPath.split('/').filter(p => p);
+                    const targetFile = pathParts[pathParts.length - 1] || 'index.html';
+                    window.location.href = targetFile;
+                } else {
+                    // For non-HTML files (raw .py, etc), open directly
+                    window.open(href, '_blank');
+                }
+            });
+        })();
+
+        // Apply theme and widget visibility immediately to prevent flicker
+        (function() {
+            const configTheme = 'dark';
+            const hasConfigUi = false;
+            const configUi = hasConfigUi ? 'None' : null;
+            const hasWidgetsConfig = false;
+            const widgetsOn = hasWidgetsConfig ? false : true;
+            let theme;
+            if (configTheme === 'auto') {
+                theme = window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
+            } else {
+                theme = localStorage.getItem('uvnote-theme') || configTheme;
+            }
+            document.documentElement.setAttribute('data-theme', theme);
+
+            // Initialize UI theme (css theme)
+            let ui = hasConfigUi ? configUi : (localStorage.getItem('uvnote-ui') || 'default');
+            if (ui !== 'default' && ui !== 'none' && ui !== 'monocolor') { ui = 'default'; }
+            document.documentElement.setAttribute('data-ui', ui);
+
+            // Apply widgets visibility
+            document.documentElement.setAttribute('data-widgets', widgetsOn ? 'on' : 'off');
+        })();
+    </script>
+    <style>
+:root[data-theme="light"] {
+            /* Surfaces */
+            --bg-primary: #ffffff;
+            --bg-secondary: #f6f8fa;
+            --bg-tertiary: #f8f9fa;
+            --bg-code: #f8f9fa;
+            --bg-error: #fdf2f2;
+            --bg-artifact: #e6f3ff;
+            --bg-artifact-hover: #d0e7ff;
+
+            /* Text */
+            --text-primary: #333333;
+            --text-secondary: #656d76;
+            --text-error: #c53030;
+            --text-link: #0969da;
+
+            /* Borders */
+            --border-primary: #e1e5e9;
+            --border-error: #e53e3e;
+            --border-cell-failed: #d73a49;
+
+            /* Colour used by box-shadow declarations */
+            --shadow: rgba(0, 0, 0, 0.1);
+        }
+
+        :root[data-theme="dark"] {
+            /* Surfaces */
+            --bg-primary: #0a0a0a;
+            --bg-secondary: #121212;
+            --bg-tertiary: #181818;
+            --bg-code: #0d0d0d;
+            --bg-error: #1a0f0f;
+            --bg-artifact: #151515;
+            --bg-artifact-hover: #1a1a1a;
+
+            /* Text */
+            --text-primary: #e0e0e0;
+            --text-secondary: #888888;
+            --text-error: #ff6b6b;
+            --text-link: #64b5f6;
+
+            /* Borders */
+            --border-primary: #2a2a2a;
+            --border-error: #ff6b6b;
+            --border-cell-failed: #ff6b6b;
+
+            /* Colour used by box-shadow declarations */
+            --shadow: rgba(255, 255, 255, 0.05);
+        }
+        /* Monocolor UI theme: black/white page background, one blue for all text and borders */
+        :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; }
+        :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; }
+        :root[data-ui="monocolor"] {
+            --mono-color: #0a66ff;
+
+            /* Every surface collapses onto the page background */
+            --bg-secondary: var(--bg-primary);
+            --bg-tertiary: var(--bg-primary);
+            --bg-code: var(--bg-primary);
+            --bg-error: var(--bg-primary);
+            --bg-artifact: var(--bg-primary);
+            --bg-artifact-hover: var(--bg-primary);
+
+            /* Every text and border colour collapses onto the accent */
+            --text-primary: var(--mono-color);
+            --text-secondary: var(--mono-color);
+            --text-error: var(--mono-color);
+            --text-link: var(--mono-color);
+            --border-primary: var(--mono-color);
+            --border-error: var(--mono-color);
+            --border-cell-failed: var(--mono-color);
+
+            --shadow: none;
+        }
+        :root[data-ui="monocolor"] a { color: var(--mono-color); }
+
+        /* Control buttons look the same at rest and on hover */
+        :root[data-ui="monocolor"] .menu-button,
+        :root[data-ui="monocolor"] .theme-toggle,
+        :root[data-ui="monocolor"] .reset-toggle,
+        :root[data-ui="monocolor"] .back-button,
+        :root[data-ui="monocolor"] .menu-button:hover,
+        :root[data-ui="monocolor"] .theme-toggle:hover,
+        :root[data-ui="monocolor"] .reset-toggle:hover,
+        :root[data-ui="monocolor"] .back-button:hover {
+            background: var(--bg-primary);
+            color: var(--mono-color);
+            border-color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .menu-dropdown {
+            background: var(--bg-primary);
+            border-color: var(--mono-color);
+            box-shadow: none;
+        }
+        :root[data-ui="monocolor"] .menu-item {
+            color: var(--mono-color);
+            border-bottom-color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .system-info,
+        :root[data-ui="monocolor"] .cell {
+            background: var(--bg-primary);
+            border-color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .cell-header,
+        :root[data-ui="monocolor"] .cell-code {
+            background: var(--bg-primary);
+            border-bottom-color: var(--mono-color);
+        }
+
+        /* Boxes that take background, border and text colour together */
+        :root[data-ui="monocolor"] .artifact,
+        :root[data-ui="monocolor"] .status-widget,
+        :root[data-ui="monocolor"] .minimap,
+        :root[data-ui="monocolor"] .file-explorer,
+        :root[data-ui="monocolor"] .tools-widget,
+        :root[data-ui="monocolor"] .tool-button {
+            background: var(--bg-primary);
+            border-color: var(--mono-color);
+            color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); }
+        :root[data-ui="monocolor"] .artifact-preview img,
+        :root[data-ui="monocolor"] .artifact-preview svg,
+        :root[data-ui="monocolor"] .tool-button.active {
+            border-color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .tools-title,
+        :root[data-ui="monocolor"] .file-explorer-section-title,
+        :root[data-ui="monocolor"] .minimap-title {
+            color: var(--mono-color);
+            border-bottom-color: var(--mono-color);
+        }
+        :root[data-ui="monocolor"] .file-explorer-item,
+        :root[data-ui="monocolor"] .minimap-item {
+            color: var(--mono-color);
+        }
+
+        /* Force Pygments code to mono blue on mono bg */
+        :root[data-ui="monocolor"] .highlight {
+            background: var(--bg-primary) !important;
+            color: var(--mono-color) !important;
+        }
+        :root[data-ui="monocolor"] .highlight *,
+        :root[data-ui="monocolor"] .highlight .hll {
+            color: var(--mono-color) !important;
+            background: transparent !important;
+            border-color: var(--mono-color) !important;
+        }
+        /* Default code font + metrics (overridable via frontmatter) */
+        :root {
+            --code-font-size: 0.95rem;
+            --code-line-height: 1.5;
+            --code-pad-y: 0.75rem;
+        }
+
+        /* Minimal ("none") UI theme: replaces the palette with a flat, 90s-style one */
+        :root[data-ui="none"] {
+            /* Surfaces */
+            --bg-primary: #ffffff;
+            --bg-secondary: transparent;
+            --bg-tertiary: transparent;
+            --bg-code: #f9f9f9;
+            --bg-error: #fff0f0;
+            --bg-artifact: #f0f7ff;
+            --bg-artifact-hover: #e5f1ff;
+
+            /* Text */
+            --text-primary: #000000;
+            --text-secondary: #222222;
+            --text-error: #a00000;
+            --text-link: #0000ee;
+
+            /* Borders */
+            --border-primary: #cccccc;
+            --border-error: #cc0000;
+            --border-cell-failed: #cc0000;
+
+            --shadow: none;
+        }
+        html { overscroll-behavior: none; }
+
+        /* Page column: centred, monospace, colours taken from the active palette */
+        body {
+            max-width: 1000px;
+            margin: 0 auto;
+            padding: 15px;
+            font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+            line-height: 1.4;
+            background: var(--bg-primary);
+            color: var(--text-primary);
+            overscroll-behavior: none;
+            transition: background-color 0.2s ease, color 0.2s ease;
+        }
+
+        /* Minimal "none" UI: serif body text on plain white, no colour transition */
+        :root[data-ui="none"] body {
+            max-width: 860px;
+            padding: 12px;
+            font-family: 'Times New Roman', Times, serif;
+            line-height: 1.5;
+            background: #ffffff;
+            color: #000000;
+            transition: none;
+        }
+        
+        /* Two panel layout removed */
+        
+        /* Fixed column of controls in the top-right corner */
+        .controls {
+            position: fixed;
+            top: 20px;
+            right: 20px;
+            z-index: 1000;
+            display: flex;
+            flex-direction: column;
+            align-items: flex-end;
+            gap: 0.25rem;
+        }
+        .controls-buttons {
+            display: flex;
+            gap: 0.5rem;
+        }
+
+        /* position: relative makes the button the anchor for its dropdown */
+        .menu-button {
+            position: relative;
+            padding: 8px 12px;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            color: var(--text-secondary);
+            font-family: inherit;
+            font-size: 0.9rem;
+            cursor: pointer;
+            user-select: none;
+        }
+        .menu-button:hover {
+            background: var(--bg-tertiary);
+            color: var(--text-primary);
+        }
+
+        /* Status widget (bottom-right) */
+        .status-widget {
+            position: fixed;
+            right: 20px;
+            bottom: 20px;
+            z-index: 100;
+            width: auto;
+            max-width: 260px;
+            padding: 6px 8px;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            color: var(--text-secondary);
+            font-size: 0.8rem;
+        }
+        .status-widget strong { color: var(--text-primary); }
+
+        /* Minimal UI with widgets enabled: controls keep a visible button look */
+        :root[data-ui="none"][data-widgets="on"] .menu-button,
+        :root[data-ui="none"][data-widgets="on"] .theme-toggle,
+        :root[data-ui="none"][data-widgets="on"] .reset-toggle,
+        :root[data-ui="none"][data-widgets="on"] .back-button {
+            background: #f6f6f6;
+            border: 1px solid #cccccc;
+            color: #222222;
+        }
+        :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
+        :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
+        :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
+        :root[data-ui="none"][data-widgets="on"] .back-button:hover {
+            background: #ededed;
+            border-color: #bbbbbb;
+            color: #000000;
+        }
+        :root[data-ui="none"][data-widgets="on"] .status-widget {
+            background: #f6f6f6;
+            border-color: #cccccc;
+            color: #222222;
+        }
+        
+        /* Dropdown panel: hidden and shifted up until its .menu-button gets .active */
+        .menu-dropdown {
+            position: absolute;
+            top: 100%;
+            right: 0;
+            z-index: 1001;
+            min-width: 160px;
+            margin-top: 4px;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 4px;
+            box-shadow: 0 4px 12px var(--shadow);
+            opacity: 0;
+            visibility: hidden;
+            transform: translateY(-8px);
+            transition: all 0.2s ease;
+        }
+        :root[data-ui="none"][data-widgets="on"] .menu-dropdown {
+            background: #ffffff;
+            border: 1px solid #cccccc;
+            box-shadow: none;
+        }
+        .menu-button.active .menu-dropdown {
+            opacity: 1;
+            visibility: visible;
+            transform: translateY(0);
+        }
+
+        /* Rows inside the dropdown, separated by a hairline */
+        .menu-item {
+            display: block;
+            padding: 8px 12px;
+            border-bottom: 1px solid var(--border-primary);
+            color: var(--text-secondary);
+            font-size: 0.85rem;
+            text-decoration: none;
+            cursor: pointer;
+        }
+        :root[data-ui="none"] .menu-item {
+            color: #000;
+            border-bottom: 1px solid #eee;
+        }
+        .menu-item:last-child { border-bottom: none; }
+        .menu-item:hover {
+            background: var(--bg-tertiary);
+            color: var(--text-primary);
+        }
+
+        /* Fixed-width slot for the check mark in front of a menu label */
+        .menu-checkbox {
+            display: inline-block;
+            width: 16px;
+            color: var(--text-link);
+            font-family: monospace;
+        }
+        
+        /* Declarations shared by all three top-bar buttons */
+        .theme-toggle,
+        .reset-toggle,
+        .back-button {
+            padding: 8px 12px;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            color: var(--text-secondary);
+            font-family: inherit;
+            cursor: pointer;
+            user-select: none;
+        }
+
+        /* Back button is a link, so it also needs link resets */
+        .back-button {
+            display: inline-block;
+            border-radius: 4px;
+            font-size: 0.9rem;
+            text-decoration: none;
+        }
+
+        /* Toggles are slightly smaller, squarer and animated */
+        .theme-toggle,
+        .reset-toggle {
+            border-radius: 2px;
+            /* padding: 0.4rem 0.6rem; */
+            font-size: 0.8rem;
+            letter-spacing: 0;
+            text-transform: lowercase;
+            transition: all 0.2s ease;
+        }
+
+        .theme-toggle:hover,
+        .reset-toggle:hover,
+        .back-button:hover {
+            background: var(--bg-tertiary);
+            color: var(--text-primary);
+        }
+        .theme-toggle:hover,
+        .reset-toggle:hover {
+            border-color: var(--text-secondary);
+        }
+
+        /* System information box */
+        .system-info {
+            margin-bottom: 16px;
+            padding: 8px 12px;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 4px;
+            color: var(--text-secondary);
+            font-size: 0.85em;
+        }
+        .system-info-header {
+            margin-bottom: 2px;
+            color: var(--text-primary);
+            font-weight: 600;
+        }
+        .system-info-content { font-family: monospace; }
+        
+        /* Minimap panel: fixed bottom-right, scrolls internally past 400px */
+        .minimap {
+            position: fixed;
+            right: 20px;
+            bottom: 20px;
+            z-index: 100;
+            width: 220px;
+            max-height: 400px;
+            padding: 0.5rem;
+            overflow-y: auto;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            font-size: 0.7rem;
+            opacity: 0.9;
+            transition: opacity 0.2s ease;
+        }
+
+        /* Hide widgets and controls when disabled via frontmatter */
+        :root[data-widgets="off"] .controls,
+        :root[data-widgets="off"] .minimap,
+        :root[data-widgets="off"] .file-explorer,
+        :root[data-widgets="off"] .tools-widget,
+        :root[data-widgets="off"] .status-widget {
+            display: none !important;
+        }
+        
+        /* File explorer panel */
+        .file-explorer {
+            position: fixed;
+            top: auto;
+            right: 20px;
+            bottom: 20px; /* default; JS will stack */
+            left: auto;
+            z-index: 100;
+            width: 220px;
+            max-height: 400px;
+            padding: 0.5rem;
+            overflow-y: auto;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            font-size: 0.7rem;
+            opacity: 0.9;
+            transition: opacity 0.2s ease;
+        }
+
+        /* Drawing overlay: covers the whole viewport */
+        .draw-overlay {
+            display: block;
+            position: fixed;
+            top: 0;
+            left: 0;
+            z-index: 80; /* under widgets (100) and controls (1000) */
+            width: 100vw;
+            height: 100vh;
+            pointer-events: none; /* enabled only when a tool is active */
+        }
+
+        /* Tools widget */
+        .tools-widget {
+            position: fixed;
+            top: auto;
+            right: 20px;
+            bottom: 20px; /* default; JS will stack */
+            left: auto;
+            z-index: 100;
+            width: 220px;
+            padding: 0.5rem;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            font-size: 0.7rem;
+            opacity: 0.95;
+        }
+        .tools-title {
+            margin-bottom: 0.5rem;
+            padding-bottom: 0.25rem;
+            border-bottom: 1px solid var(--border-primary);
+            color: var(--text-secondary);
+            font-weight: bold;
+            cursor: grab;
+            user-select: none;
+        }
+        .tools-row {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 0.4rem;
+        }
+        .tool-button {
+            padding: 0.25rem 0.4rem;
+            background: var(--bg-tertiary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            color: var(--text-secondary);
+            font-family: inherit;
+            font-size: 0.75rem;
+            cursor: pointer;
+            user-select: none;
+        }
+        .tool-button:hover { color: var(--text-primary); }
+        .tool-button.active {
+            background: var(--bg-secondary);
+            border-color: var(--text-secondary);
+            color: var(--text-primary);
+        }
+        
+        /* Panels become fully opaque while hovered */
+        .minimap:hover,
+        .file-explorer:hover {
+            opacity: 1;
+        }
+
+        .minimap-title {
+            margin-bottom: 0.5rem;
+            padding-bottom: 0.25rem;
+            border-bottom: 1px solid var(--border-primary);
+            color: var(--text-secondary);
+            font-weight: bold;
+            cursor: grab; /* drag handle */
+            user-select: none;
+        }
+
+        /* Entry row; the transparent left border is recoloured on hover/active */
+        .minimap-item {
+            display: block;
+            padding: 0.15rem 0 0.15rem 0.5rem;
+            border-left: 2px solid transparent;
+            color: var(--text-secondary);
+            text-decoration: none;
+            cursor: pointer;
+            transition: all 0.2s ease;
+        }
+        .minimap-item:hover {
+            border-left-color: var(--text-secondary);
+            color: var(--text-primary);
+        }
+        .minimap-item.active {
+            border-left-color: var(--text-link);
+            color: var(--text-primary);
+        }
+
+        /* Heading entries indent by an extra 0.5rem per level */
+        .minimap-heading { font-weight: normal; }
+        .minimap-heading.h1 { padding-left: 0.5rem; }
+        .minimap-heading.h2 { padding-left: 1rem; }
+        .minimap-heading.h3 { padding-left: 1.5rem; }
+        .minimap-heading.h4 { padding-left: 2rem; }
+        .minimap-heading.h5 { padding-left: 2.5rem; }
+        .minimap-heading.h6 { padding-left: 3rem; }
+
+        /* Cell entries: link-coloured italics, slightly dimmed until hovered */
+        .minimap-cell {
+            color: var(--text-link);
+            font-style: italic;
+            opacity: 0.8;
+        }
+        .minimap-cell:hover { opacity: 1; }
+        
+        .file-explorer-title {
+            margin-bottom: 0.5rem;
+            padding-bottom: 0.25rem;
+            border-bottom: 1px solid var(--border-primary);
+            color: var(--text-secondary);
+            font-weight: bold;
+            cursor: grab; /* drag handle */
+            user-select: none;
+        }
+
+        .file-explorer-section { margin-bottom: 0.75rem; }
+        .file-explorer-section-title {
+            margin-bottom: 0.25rem;
+            color: var(--text-secondary);
+            font-size: 0.65rem;
+            font-weight: bold;
+            letter-spacing: 0.5px;
+            text-transform: uppercase;
+        }
+
+        /* The .script/.artifact variants come after :hover at equal specificity,
+           so they keep their own colour while hovered. */
+        .file-explorer-item {
+            display: block;
+            margin-left: 0.5rem;
+            padding: 0.1rem 0;
+            color: var(--text-secondary);
+            font-family: monospace;
+            text-decoration: none;
+            cursor: pointer;
+            transition: color 0.2s ease;
+        }
+        .file-explorer-item:hover { color: var(--text-primary); }
+        .file-explorer-item.script { color: var(--text-link); }
+        .file-explorer-item.artifact {
+            color: var(--text-secondary);
+            opacity: 0.8;
+        }
+
+        /* Hide widgets on smaller screens */
+        @media (max-width: 768px) {
+            .minimap,
+            .file-explorer,
+            .tools-widget {
+                display: none;
+            }
+        }
+        
+        /* Notebook cell container; overflow is clipped at the border */
+        .cell {
+            margin: 1rem 0;
+            overflow: hidden;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+        }
+        /* Minimal UI: cells lose their frame and background */
+        :root[data-ui="none"] .cell {
+            margin: 1em 0;
+            background: transparent;
+            border: none;
+        }
+        /* Cell header bar. All of its declarations live in this one rule:
+           the last four (color, cursor, user-select, transition) previously sat
+           outside any rule after the minimal-UI overrides, where they were ignored
+           and also swallowed the following .cell-header:hover rule. */
+        .cell-header {
+            background: var(--bg-secondary);
+            padding: 0.5rem 1rem;
+            border-bottom: 1px solid var(--border-primary);
+            font-family: inherit;
+            font-size: 0.85rem;
+            color: var(--text-secondary);
+            cursor: pointer;
+            user-select: none;
+            transition: background-color 0.2s ease;
+        }
+        /* Minimal "none" UI overrides for cells */
+        :root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; }
+        :root[data-ui="none"] .cell-content { padding: 0; }
+        /* NOTE(review): the button rules in this stylesheet use .copy-btn; confirm .copy-button is the intended class here. */
+        :root[data-ui="none"] .copy-button,
+        :root[data-ui="none"] .collapse-indicators,
+        :root[data-ui="none"] .cell-meta,
+        :root[data-ui="none"] .cell-outputs-header { display: none !important; }
+        :root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; }
+        :root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; }
+        :root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; }
+        .cell-header:hover {
+            background: var(--bg-tertiary);
+        }
+        /* Collapse indicator glyphs shown in the cell header */
+        .collapse-indicators {
+            color: var(--text-secondary);
+            font-size: 0.8rem;
+            opacity: 0.7;
+        }
+        .collapse-indicators span:hover {
+            color: var(--text-primary);
+            opacity: 1;
+        }
+
+        /* Code area of a cell; .collapsed hides it */
+        .cell-code {
+            display: block;
+            background: var(--bg-code);
+        }
+        .cell-code.collapsed { display: none; }
+        .cell-code pre {
+            margin: 0;
+            padding: 0.75rem;
+            overflow-x: auto;
+            background: var(--bg-code);
+            color: var(--text-primary);
+        }
+
+        /* Output area of a cell; .collapsed hides it */
+        .cell-output {
+            padding: 0.75rem;
+            /* background: var(--bg-primary); */
+            background: var(--bg-secondary);
+        }
+        .cell-output.collapsed { display: none; }
+
+        /* Captured stdout box */
+        .cell-stdout {
+            max-width: 100%;         /* respects whatever layout width you give it */
+            padding: 0.75rem;
+            /* margin: 0.25rem 0; */
+            overflow: auto;          /* show scrollbars when needed */
+            background: var(--bg-tertiary);
+            border-radius: 1px;
+            color: var(--text-primary);
+            font-family: inherit;
+            font-size: 0.9rem;
+            white-space: pre-wrap;
+        }
+
+        /* Inner text block of the stdout box: one unwrapped run per line */
+        .cell-stdout .stdout-text {
+            display: inline-block;   /* shrink-to-content */
+            min-width: max-content;  /* allow very long lines to define intrinsic width */
+            margin: 0;               /* reset pre default margin */
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            tab-size: 2;
+            white-space: pre;        /* keep line breaks, NO wrapping */
+        }
+
+        /* Captured stderr: error-tinted block with an accent bar on the left */
+        .cell-stderr {
+            background: var(--bg-error);
+            border-left: 2px solid var(--border-error);
+            padding: 1rem;
+            margin: 0.5rem 0;
+            font-family: inherit;
+            font-size: 0.9rem;
+            color: var(--text-error);
+            white-space: pre-wrap;
+        }
+        /* uv install logs: clickable header plus log body.
+           Neither theme palette in this stylesheet defines --border-color, and a
+           var() with no value and no fallback invalidates the whole border-left
+           declaration. The fallback keeps any externally supplied --border-color
+           and otherwise uses the palette's --border-primary. */
+        .uv-install-logs {
+            margin: 0.5rem 0;
+        }
+        .uv-logs-header {
+            cursor: pointer;
+            padding: 0.75rem;
+            border-left: 3px solid var(--border-color, var(--border-primary));
+            font-family: inherit;
+            font-size: 0.85rem;
+            color: var(--text-secondary);
+            user-select: none;
+        }
+        .uv-logs-content {
+            background: var(--bg-secondary);
+            padding: 1rem;
+            border-left: 3px solid var(--border-color, var(--border-primary));
+            white-space: pre-wrap;
+            font-family: monospace;
+            font-size: 0.85rem;
+            color: var(--text-secondary);
+            overflow-x: auto;
+        }
+        /* Artifact list under a cell */
+        .cell-artifacts { margin: 1rem 0; }
+        .cell-artifacts h4 {
+            margin: 0 0 0.5rem 0;
+            color: var(--text-secondary);
+            font-size: 0.9rem;
+        }
+
+        /* Individual artifact link rendered as a small chip */
+        .artifact {
+            display: inline-block;
+            margin: 0.25rem 0.5rem 0.25rem 0;
+            padding: 0.25rem 0.5rem;
+            background: var(--bg-artifact);
+            border: 1px solid var(--border-primary);
+            border-radius: 1px;
+            color: var(--text-link);
+            font-family: inherit;
+            font-size: 0.8rem;
+            text-decoration: none;
+            transition: background-color 0.2s ease;
+        }
+        .artifact:hover { background: var(--bg-artifact-hover); }
+
+        /* Inline previews: images and SVGs scale down to the column width */
+        .artifact-preview { margin-top: 1rem; }
+        .artifact-preview img,
+        .artifact-preview svg {
+            max-width: 100%;
+            height: auto;
+            border: 1px solid var(--border-primary);
+            border-radius: 1px;
+        }
+        /* Auto-theme SVG elements */
+        .artifact-preview svg {
+            display: block;
+            background: transparent;
+        }
+        /* Style SVG text elements */
+        .artifact-preview svg g {
+            fill: var(--text-primary) !important;
+        }
+        /* CSV table styling */
+        .artifact-csv {
+            margin-top: 1rem;
+            overflow-x: auto;
+        }
+        .csv-table {
+            width: 100%;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 1px;
+            border-collapse: collapse;
+            font-size: 0.9rem;
+        }
+        .csv-table th,
+        .csv-table td {
+            padding: 0.5rem 0.75rem;
+            border: 1px solid var(--border-primary);
+            text-align: left;
+        }
+        .csv-table th {
+            background: var(--bg-tertiary);
+            color: var(--text-primary);
+            font-weight: 600;
+        }
+        .csv-table tbody tr:hover { background: var(--bg-artifact-hover); }
+
+        /* Error box used in place of a CSV table */
+        .artifact-csv-error {
+            margin-top: 1rem;
+            padding: 1rem;
+            background: var(--bg-error);
+            border: 1px solid var(--border-error);
+            border-radius: 1px;
+            color: var(--text-error);
+        }
+        /* Failed cell: error-coloured frame and header */
+        .cell-failed { border-color: var(--border-cell-failed); }
+        .cell-failed .cell-header {
+            background: var(--bg-error);
+            color: var(--text-error);
+        }
+
+        /* Commented-out cell: dimmed, dashed frame, italic header */
+        .cell-commented {
+            border-style: dashed;
+            opacity: 0.6;
+        }
+        .cell-commented .cell-header {
+            background: var(--bg-secondary);
+            color: var(--text-secondary);
+            font-style: italic;
+        }
+        /* Small action buttons (run / copy / raw / GitHub / HF): shared look */
+        .run-btn,
+        .copy-btn,
+        .raw-btn,
+        .github-btn,
+        .hf-btn {
+            margin-left: 4px;
+            padding: 2px 6px;
+            background: var(--bg-tertiary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            color: var(--text-secondary);
+            font-family: inherit;
+            font-size: 0.75em;
+            cursor: pointer;
+        }
+        /* These three also reset underline and inline layout */
+        .raw-btn,
+        .github-btn,
+        .hf-btn {
+            display: inline-block;
+            text-decoration: none;
+        }
+
+        .run-btn:hover,
+        .copy-btn:hover,
+        .raw-btn:hover,
+        .github-btn:hover,
+        .hf-btn:hover {
+            background: var(--bg-primary);
+            color: var(--text-primary);
+        }
+        .raw-btn:hover,
+        .github-btn:hover,
+        .hf-btn:hover {
+            text-decoration: none;
+        }
+
+        .run-btn:disabled,
+        .copy-btn:disabled {
+            cursor: not-allowed;
+            opacity: 0.6;
+        }
+
+        /* "Copied" state; kept after the :hover rules so its colour wins */
+        .copy-btn.copied {
+            background: var(--bg-primary);
+            border-color: #4caf50;
+            color: #4caf50;
+            transition: all 0.2s ease;
+        }
+        /* Stale output: dimmed, with an "updating" badge pinned to its top-right */
+        .output-stale {
+            position: relative; /* positioning context for the ::after badge */
+            opacity: 0.5;
+        }
+        .output-stale::after {
+            content: '⏳ updating...';
+            position: absolute;
+            top: 8px;
+            right: 8px;
+            padding: 4px 8px;
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            border-radius: 2px;
+            color: var(--text-secondary);
+            font-size: 0.75em;
+        }
+        /* Headings share spacing and colour; h1 then overrides the margins */
+        h1,
+        h2,
+        h3,
+        h4,
+        h5,
+        h6 {
+            margin-top: 1.5rem;
+            margin-bottom: 0.75rem;
+            color: var(--text-primary);
+        }
+        h1 {
+            margin-top: 0;
+            margin-bottom: 1rem;
+        }
+
+        p {
+            margin: 0.75rem 0;
+            color: var(--text-primary);
+        }
+        a { color: var(--text-link); }
+
+        /* Images never exceed the column width */
+        img {
+            max-width: 100%;
+            height: auto;
+            border-radius: 1px;
+            box-shadow: none;
+        }
+
+        /* Code font; size comes from the --code-font-size variable */
+        pre,
+        code {
+            font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+            font-size: var(--code-font-size);
+        }
+        /* Positioning context for the absolutely positioned line-highlight overlay. */
+        .code-wrap {
+            position: relative;
+        }
+        /* Overlay bar for the selected line; hidden by default and never intercepts clicks. */
+        .code-line-highlight {
+            display: none;
+            position: absolute;
+            left: 0;
+            right: 0;
+            height: 1.5em;
+            border-left: 3px solid #f4c542;
+            background: rgba(255, 235, 170, 0.35);
+            pointer-events: none;
+        }
+        /* Clickable gutter number. */
+        .line-number {
+            padding: 0 0.25rem;
+            color: var(--text-secondary);
+            text-decoration: none;
+            cursor: pointer;
+        }
+        .line-number.selected {
+            color: var(--text-primary);
+            background: rgba(255, 235, 170, 0.4);
+        }
+        
+        /* Line numbers */
+        /* Gutter and code sit side by side; the code column takes the remaining width. */
+        .highlight-with-lines {
+            display: flex;
+        }
+        .highlight-with-lines .highlight {
+            flex: 1;
+        }
+        /* The gutter uses the same font metrics as the code so the rows line up. */
+        .line-numbers {
+            padding: var(--code-pad-y) 0.5rem;
+            border-right: 1px solid var(--border-primary);
+            background: var(--bg-tertiary);
+            color: var(--text-secondary);
+            font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+            font-size: var(--code-font-size);
+            line-height: var(--code-line-height);
+            text-align: right;
+            user-select: none;
+        }
+        .line-numbers .line-number {
+            display: block;
+            line-height: var(--code-line-height);
+        }
+        /* Neutralise Pygments' highlighted-line background so it does not conflict with our
+           own line highlight. !important is needed because the themed
+           [data-theme] .highlight .hll rules in this sheet are more specific and would
+           otherwise override this declaration. */
+        .highlight .hll { background-color: transparent !important; }
+        /* Code block metrics; white-space: pre keeps long lines unwrapped. */
+        .highlight pre {
+            margin: 0;
+            padding: var(--code-pad-y) 0.75rem;
+            line-height: var(--code-line-height);
+            white-space: pre;
+        }
+        
+        /* Collapsed code styling */
+        /* Code pane is visible by default, separated from what follows by a rule. */
+        .cell-code {
+            display: block;
+            border-bottom: 1px solid var(--border-primary);
+        }
+        /* State classes: two-class selectors outrank the base rule whatever the source order. */
+        .cell-code.expanded {
+            display: block;
+        }
+        .cell-code.collapsed {
+            display: none;
+        }
+        
+        
+        /* Syntax-highlighting palette for the light theme. The first five rules are
+           theme-independent line-number helpers; every following rule is scoped with
+           [data-theme="light"] and colours one token class (named in the trailing comment).
+           NOTE(review): this looks like Pygments-generated output; confirm before hand-editing. */
+        pre { line-height: 125%; }
+td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
+span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
+td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
+span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
+[data-theme="light"] .highlight .hll { background-color: #ffffcc }
+[data-theme="light"] .highlight { background: #f8f8f8; }
+[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */
+[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */
+[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */
+[data-theme="light"] .highlight .o { color: #666 } /* Operator */
+[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */
+[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */
+[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */
+[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */
+[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */
+[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */
+[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */
+[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */
+[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
+[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */
+[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */
+[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */
+[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */
+[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */
+[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */
+[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */
+[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */
+[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */
+[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */
+[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */
+[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */
+[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */
+[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */
+[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */
+[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */
+[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */
+[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */
+[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */
+[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */
+[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */
+[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */
+[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */
+[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */
+[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */
+[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */
+[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */
+[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */
+[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */
+[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */
+[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */
+[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */
+[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */
+[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */
+[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */
+[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */
+[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */
+[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */
+[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */
+[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */
+[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */
+[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */
+[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */
+[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */
+[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */
+[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */
+[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */
+
+/* Syntax-highlighting palette for the dark theme. Every rule is scoped with
+   [data-theme="dark"] and colours one token class (named in the trailing comment).
+   The theme-independent pre/linenos helper rules are not repeated here: identical
+   copies already precede the light palette earlier in this sheet. */
+[data-theme="dark"] .highlight .hll { background-color: #49483e }
+[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 }
+[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */
+[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */
+[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */
+[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */
+[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */
+[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */
+[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */
+[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */
+[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */
+[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */
+[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */
+[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */
+[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */
+[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */
+[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */
+[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */
+[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */
+[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */
+[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */
+[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */
+[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */
+[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */
+[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */
+[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */
+[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */
+[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */
+[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */
+[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */
+[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */
+[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */
+[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */
+[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */
+[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */
+[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */
+[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */
+[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */
+[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */
+[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */
+[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */
+[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */
+[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */
+[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */
+[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */
+[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */
+[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */
+[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */
+[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */
+[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */
+[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */
+[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */
+[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */
+[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */
+[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */
+[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */
+[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */
+[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */
+[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */
+[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */
+[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */
+[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */
+[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */
+[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */
+[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */
+[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */
+[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */
+[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */
+[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */
+[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */
+[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */
+[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */
+[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */
+[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */
+[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */
+[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */
+[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */
+[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */
+[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */
+[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */
+
+        /* Ensure our code metrics override Pygments defaults */
+        /* !important pins the code metrics against the later `pre { line-height: 125% }` defaults. */
+        .highlight pre {
+            margin: 0;
+            border: none;
+            padding: var(--code-pad-y) 0.75rem !important;
+            font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important;
+            font-size: var(--code-font-size) !important;
+            line-height: var(--code-line-height) !important;
+            white-space: pre;
+        }
+        /* Gutter rows must share the code's line height to stay aligned. */
+        .line-numbers,
+        .line-numbers .line-number {
+            line-height: var(--code-line-height) !important;
+        }
+
+        /* Custom CSS from frontmatter */
+        
+
+        
+        
+        
+        /* Cursor for tools */
+        /* While a tool is active (body[data-tool]), the pointer over .main-content is replaced
+           by an inline SVG data-URI cursor. In each value the two numbers after url(...) are
+           the hotspot (x y) inside the 24x24 image and the final keyword is the fallback cursor.
+           '#' in the stroke/fill colours is percent-encoded as %23 because the SVG sits in a URL. */
+        body[data-tool="arrow"] .main-content { 
+            cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><path d="M2 2l7.586 7.586"/><circle cx="11" cy="11" r="2"/></svg>') 12 12, crosshair;
+        }
+        /* Pen: hotspot at (4, 20), where the SVG draws its filled circle. */
+        body[data-tool="pen"] .main-content { 
+            cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><circle cx="4" cy="20" r="2" fill="%23e53935"/></svg>') 4 20, pointer;
+        }
+        /* Eraser: hotspot at the centre of the image. */
+        body[data-tool="eraser"] .main-content { 
+            cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M20 20H7l-7-7 7-7h13v14z"/><path d="M13 13l7-7"/><path d="M13 13L9 9"/></svg>') 12 12, auto;
+        }
+
+        /* Color picker styles */
+        /* Small uppercase caption above each group of tool controls. */
+        .tools-section-title {
+            margin: 0.75rem 0 0.5rem 0;
+            color: var(--text-secondary);
+            font-size: 0.65rem;
+            font-weight: bold;
+            letter-spacing: 0.5px;
+            text-transform: uppercase;
+        }
+        /* Swatches are laid out on a six-column grid. */
+        .color-row {
+            display: grid;
+            grid-template-columns: repeat(6, 1fr);
+            gap: 0.25rem;
+            margin-bottom: 0.5rem;
+        }
+        .color-swatch {
+            position: relative; /* anchors the ::after check mark */
+            width: 18px;
+            height: 18px;
+            border: 2px solid var(--border-primary);
+            border-radius: 3px;
+            cursor: pointer;
+            transition: all 0.2s ease;
+        }
+        .color-swatch:hover {
+            border-color: var(--text-secondary);
+            transform: scale(1.1);
+        }
+        .color-swatch.selected {
+            border-color: var(--text-primary);
+            box-shadow: 0 0 0 2px var(--text-link);
+        }
+        /* Check mark centred on the selected swatch; the shadow keeps it legible on light colours. */
+        .color-swatch.selected::after {
+            content: '✓';
+            position: absolute;
+            top: 50%;
+            left: 50%;
+            transform: translate(-50%, -50%);
+            color: white;
+            font-size: 10px;
+            font-weight: bold;
+            text-shadow: 1px 1px 1px black;
+        }
+        /* Custom colour input: spans two grid columns and is centred within them. */
+        .color-input {
+            grid-column: span 2;
+            justify-self: center;
+            width: 24px;
+            height: 24px;
+            padding: 0;
+            border: 2px solid var(--border-primary);
+            border-radius: 3px;
+            background: none;
+            cursor: pointer;
+        }
+        .color-input:hover {
+            border-color: var(--text-secondary);
+        }
+        
+        /* Thickness slider styles */
+        /* Slider and its numeric read-out share one row. */
+        .thickness-row {
+            display: flex;
+            align-items: center;
+            gap: 0.5rem;
+            margin-top: 0.75rem;
+        }
+        /* Native range input restyled as a thin track; slightly faded until hovered or focused. */
+        .thickness-slider {
+            flex: 1;
+            -webkit-appearance: none;
+            appearance: none;
+            height: 4px;
+            background: var(--border-primary);
+            border-radius: 2px;
+            outline: none;
+            opacity: 0.7;
+            transition: opacity 0.2s;
+        }
+        .thickness-slider:hover {
+            opacity: 1;
+        }
+        /* `outline: none` above removes the browser's focus ring, so keyboard users need an
+           explicit replacement. :focus-visible leaves mouse interaction unchanged. */
+        .thickness-slider:focus-visible {
+            outline: 2px solid var(--text-link);
+            outline-offset: 2px;
+            opacity: 1;
+        }
+        /* The thumb has to be styled once per engine: WebKit/Blink and Gecko use different pseudo-elements. */
+        .thickness-slider::-webkit-slider-thumb {
+            -webkit-appearance: none;
+            appearance: none;
+            width: 12px;
+            height: 12px;
+            background: var(--text-link);
+            border-radius: 50%;
+            cursor: pointer;
+        }
+        .thickness-slider::-moz-range-thumb {
+            width: 12px;
+            height: 12px;
+            background: var(--text-link);
+            border-radius: 50%;
+            cursor: pointer;
+            border: none;
+        }
+        .thickness-value {
+            font-size: 0.7rem;
+            color: var(--text-secondary);
+            min-width: 20px;
+            text-align: right;
+        }
+
+        /* Drop any background on highlighted code; !important outranks the themed .highlight backgrounds. */
+        .highlight { background: none !important; }
+        
+        /* Loading animations */
+        /* Inline spinner: a ring whose top edge is accent-coloured, rotated once per second. */
+        .loading-spinner {
+            display: inline-block;
+            vertical-align: middle;
+            width: 16px;
+            height: 16px;
+            margin-right: 8px;
+            border: 2px solid var(--border-primary);
+            border-top-color: var(--text-link); /* must follow the border shorthand */
+            border-radius: 50%;
+            animation: spin 1s linear infinite;
+        }
+
+        @keyframes spin {
+            100% { transform: rotate(360deg); }
+        }
+
+        /* Placeholder bar with a gradient that sweeps across it. */
+        .loading-skeleton {
+            display: inline-block;
+            vertical-align: middle;
+            width: 80px;
+            height: 1em;
+            border-radius: 2px;
+            /* solid colour first, as a fallback for the gradient declared after it */
+            background: var(--bg-tertiary);
+            background: linear-gradient(
+                90deg,
+                var(--bg-tertiary) 25%,
+                var(--bg-secondary) 50%,
+                var(--bg-tertiary) 75%
+            );
+            background-size: 200% 100%;
+            animation: loading-shimmer 2s ease-in-out infinite;
+        }
+
+        @keyframes loading-shimmer {
+            from { background-position: -200% 0; }
+            to { background-position: 200% 0; }
+        }
+
+        /* A cell output that currently contains a spinner is shown dimmed. */
+        .cell-output:has(.loading-spinner) {
+            background: var(--bg-secondary);
+            opacity: 0.7;
+            /* border-left: 3px solid var(--text-link); */
+        }
+    </style>
+    <script>
+// --- Drag utilities ---
+        function clamp(val, min, max) { return Math.max(min, Math.min(max, val)); }
+
+        function restorePosition(el, storageKey) {
+            try {
+                const raw = localStorage.getItem(storageKey);
+                if (!raw) return;
+                const pos = JSON.parse(raw);
+                if (typeof pos.left === 'number' && typeof pos.top === 'number') {
+                    el.style.left = pos.left + 'px';
+                    el.style.top = pos.top + 'px';
+                    el.style.right = 'auto';
+                    el.style.bottom = 'auto';
+                }
+            } catch (_) {}
+        }
+
+        function savePosition(el, storageKey) {
+            try {
+                const left = parseFloat(el.style.left || 'NaN');
+                const top = parseFloat(el.style.top || 'NaN');
+                if (!Number.isNaN(left) && !Number.isNaN(top)) {
+                    localStorage.setItem(storageKey, JSON.stringify({ left, top }));
+                }
+            } catch (_) {}
+        }
+
+
+        function makeDraggable(el, storageKey, handleEl) {
+            let dragging = false;
+            let startX = 0, startY = 0; // cursor
+            let origLeft = 0, origTop = 0; // element
+
+            const onMove = (e) => {
+                if (!dragging) return;
+                const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+                const clientY = e.touches ? e.touches[0].clientY : e.clientY;
+                const dx = clientX - startX;
+                const dy = clientY - startY;
+                const w = el.offsetWidth;
+                const h = el.offsetHeight;
+                const maxX = window.innerWidth - w;
+                const maxY = window.innerHeight - h;
+                const newLeft = clamp(origLeft + dx, 0, maxX);
+                const newTop = clamp(origTop + dy, 0, maxY);
+                el.style.left = newLeft + 'px';
+                el.style.top = newTop + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+            };
+
+            const endDrag = () => {
+                if (!dragging) return;
+                dragging = false;
+                document.removeEventListener('mousemove', onMove);
+                document.removeEventListener('mouseup', endDrag);
+                document.removeEventListener('touchmove', onMove);
+                document.removeEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grab');
+                savePosition(el, storageKey);
+                // ensure no-overlap constraint after a drag
+                try { layoutWidgetsStackedBottomRight(); } catch (_) {}
+            };
+
+            const startDrag = (e) => {
+                // Start from element's current on-screen rect
+                const elRect = el.getBoundingClientRect();
+                el.style.left = elRect.left + 'px';
+                el.style.top = elRect.top + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+
+                dragging = true;
+                startX = e.touches ? e.touches[0].clientX : e.clientX;
+                startY = e.touches ? e.touches[0].clientY : e.clientY;
+                origLeft = elRect.left;
+                origTop = elRect.top;
+
+                document.addEventListener('mousemove', onMove);
+                document.addEventListener('mouseup', endDrag);
+                document.addEventListener('touchmove', onMove, { passive: false });
+                document.addEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grabbing');
+                e.preventDefault();
+            };
+
+            (handleEl || el).addEventListener('mousedown', startDrag);
+            (handleEl || el).addEventListener('touchstart', startDrag, { passive: false });
+
+            // Apply any saved position on init
+            restorePosition(el, storageKey);
+        }
+        function toggleCell(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+            }
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+            }
+            
+            updateIndicators(cellId);
+            encodeToolStateToUrl();
+        }
+        
+        function toggleCode(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+        
+        function toggleOutput(cellId) {
+            const outputElement = document.getElementById('output-' + cellId);
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+
+        function toggleUvLogs(headerElement) {
+            const contentElement = headerElement.nextElementSibling;
+            if (contentElement) {
+                const isCollapsed = contentElement.style.display === 'none';
+                contentElement.style.display = isCollapsed ? 'block' : 'none';
+                headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+
+                // Update the header indicator if it exists
+                const uvLogsDiv = headerElement.parentElement;
+                if (uvLogsDiv && uvLogsDiv.id && uvLogsDiv.id.startsWith('uv-logs-')) {
+                    const cellId = uvLogsDiv.id.replace('uv-logs-', '');
+                    const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+
+        function toggleUvLogsFromHeader(cellId) {
+            const uvLogsElement = document.getElementById('uv-logs-' + cellId);
+            const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+            if (uvLogsElement) {
+                const headerElement = uvLogsElement.querySelector('.uv-logs-header');
+                const contentElement = uvLogsElement.querySelector('.uv-logs-content');
+                if (contentElement && headerElement) {
+                    const isCollapsed = contentElement.style.display === 'none';
+                    contentElement.style.display = isCollapsed ? 'block' : 'none';
+                    headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+        
+        function updateIndicators(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            const indicators = document.querySelector(`[onclick*="${cellId}"]`)?.closest('.cell-header')?.querySelector('.collapse-indicators');
+            
+            if (indicators) {
+                const codeCollapsed = codeElement && codeElement.classList.contains('collapsed');
+                const outputCollapsed = outputElement && outputElement.classList.contains('collapsed');
+                
+                const codeIcon = codeCollapsed ? '▶' : '▼';
+                const outputIcon = outputCollapsed ? '▶' : '▼';
+                
+                const codeSpan = indicators.querySelector('[onclick*="toggleCode"]');
+                const outputSpan = indicators.querySelector('[onclick*="toggleOutput"]');
+                
+                if (codeSpan) codeSpan.innerHTML = `${codeIcon} code`;
+                if (outputSpan) outputSpan.innerHTML = `${outputIcon} output`;
+            }
+        }
+        
+        function toggleTheme() {
+            const html = document.documentElement;
+            const currentTheme = html.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            html.setAttribute('data-theme', newTheme);
+            localStorage.setItem('uvnote-theme', newTheme);
+            updateThemeIcon();
+            updateUiDebug();
+        }
+        
+        // Two panel code removed
+        
+        function updateThemeIcon() {
+            const theme = document.documentElement.getAttribute('data-theme');
+            const toggle = document.querySelector('.theme-toggle');
+            if (toggle) {
+                toggle.textContent = theme === 'dark' ? 'light' : 'dark';
+            }
+        }
+        function setUiTheme(newUi) {
+            if (newUi !== 'default' && newUi !== 'none' && newUi !== 'monocolor') return;
+            const html = document.documentElement;
+            html.setAttribute('data-ui', newUi);
+            try { localStorage.setItem('uvnote-ui', newUi); } catch (_) {}
+            updateUiMenu();
+            updateUiDebug();
+        }
+        function updateUiMenu() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const checks = {
+                default: document.getElementById('checkbox-ui-default'),
+                none: document.getElementById('checkbox-ui-none'),
+                monocolor: document.getElementById('checkbox-ui-monocolor')
+            };
+            if (checks.default) checks.default.textContent = ui === 'default' ? '☑' : '☐';
+            if (checks.none) checks.none.textContent = ui === 'none' ? '☑' : '☐';
+            if (checks.monocolor) checks.monocolor.textContent = ui === 'monocolor' ? '☑' : '☐';
+        }
+
+        function updateUiDebug() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const color = document.documentElement.getAttribute('data-theme') || 'light';
+            const el = document.getElementById('ui-debug');
+            if (el) {
+                el.textContent = `UI: ${ui} | Color: ${color}`;
+            }
+        }
+
+        // Line selection and deep-linking
+        function clearLineSelections() {
+            try {
+                document.querySelectorAll('.code-line-highlight').forEach(el => { el.style.display = 'none'; });
+                document.querySelectorAll('.line-number.selected').forEach(el => el.classList.remove('selected'));
+            } catch (_) {}
+        }
+
+        // Currently highlighted line range, or null when nothing is selected.
+        // `a` and `b` are the first and last selected line numbers within cell `cellId`.
+        let _selection = null; // { cellId, a, b }
+
+        function clearSelection(updateUrl) {
+            clearLineSelections();
+            _selection = null;
+            if (updateUrl) {
+                try {
+                    const url = new URL(window.location.href);
+                    url.searchParams.delete('cell');
+                    url.searchParams.delete('line');
+                    history.replaceState(null, '', url.toString());
+                } catch (_) {}
+            }
+            updateStateIndicator();
+        }
+
+        function selectCellLine(cellId, line, updateUrl) {
+            try {
+                // Ensure only one selection across the whole document
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                // Measure line height directly from computed style
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                // Position overlay
+                overlay.style.display = 'block';
+                overlay.style.height = `${lh}px`;
+                overlay.style.top = `${pre.offsetTop + padTop + (line - 1) * lh}px`;
+
+                // Update selected class in line numbers
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    const sel = numbers.querySelector(`.line-number[data-line="${line}"]`);
+                    if (sel) sel.classList.add('selected');
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    url.searchParams.set('line', String(line));
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a: line, b: line };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLine error', e); }
+        }
+
+        function selectCellLines(cellId, startLine, endLine, updateUrl) {
+            try {
+                // normalize order
+                const a = Math.min(startLine, endLine);
+                const b = Math.max(startLine, endLine);
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                overlay.style.display = 'block';
+                overlay.style.top = `${pre.offsetTop + padTop + (a - 1) * lh}px`;
+                overlay.style.height = `${(b - a + 1) * lh}px`;
+
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    for (let i = a; i <= b; i++) {
+                        const el = numbers.querySelector(`.line-number[data-line="${i}"]`);
+                        if (el) el.classList.add('selected');
+                    }
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    if (a === b) url.searchParams.set('line', String(a));
+                    else url.searchParams.set('line', `${a}-${b}`);
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a, b };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLines error', e); }
+        }
+
+        // Drag-to-select support on line numbers
+        // Drag state shared by the mouse handlers below: `active` is true between
+        // mousedown on a line number and the following mouseup, `cellId` is the cell the
+        // drag started in, and `start` is the line number where it began.
+        let _lineDrag = { active: false, cellId: null, start: 0 };
+        function onLineNumberMouseDown(e) {
+            const a = e.target.closest('.line-number');
+            if (!a) return;
+            e.preventDefault();
+            const cellId = a.dataset.cell;
+            const line = parseInt(a.dataset.line || '1', 10) || 1;
+            // Toggle off if this exact single line is already the only selection
+            const numbers = document.getElementById(`lines-${cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length === 1 && selected[0] === line) {
+                    clearSelection(true);
+                    return;
+                }
+            }
+            _lineDrag.active = true;
+            _lineDrag.cellId = cellId;
+            _lineDrag.start = line;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, _lineDrag.start, false);
+        }
+        function onDocMouseMove(e) {
+            if (!_lineDrag.active) return;
+            const el = document.elementFromPoint(e.clientX, e.clientY);
+            if (!el) return;
+            const a = el.closest && el.closest('.line-number');
+            if (!a) return;
+            if (a.dataset.cell !== _lineDrag.cellId) return;
+            const cur = parseInt(a.dataset.line || '1', 10) || 1;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, cur, false);
+        }
+        function onDocMouseUp(e) {
+            if (!_lineDrag.active) return;
+            const last = document.querySelector('.line-number.selected:last-of-type');
+            // finalize URL using the current selected range
+            const numbers = document.getElementById(`lines-${_lineDrag.cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length) {
+                    const a = Math.min(...selected); const b = Math.max(...selected);
+                    selectCellLines(_lineDrag.cellId, a, b, true);
+                }
+            }
+            _lineDrag.active = false; _lineDrag.cellId = null; _lineDrag.start = 0;
+        }
+
+        function applyLocationFromUrl() {
+            try {
+                const url = new URL(window.location.href);
+                const cell = url.searchParams.get('cell');
+                const lineParam = url.searchParams.get('line');
+                if (cell && lineParam) {
+                    if (lineParam.includes('-')) {
+                        const [a, b] = lineParam.split('-').map(x => parseInt(x, 10));
+                        if (!Number.isNaN(a) && !Number.isNaN(b)) selectCellLines(cell, a, b, false);
+                    } else {
+                        const l = parseInt(lineParam, 10);
+                        if (!Number.isNaN(l)) selectCellLine(cell, l, false);
+                    }
+                }
+                
+                // Apply tool parameters from URL
+                applyToolsFromUrl(url.searchParams);
+                
+                // Cell states will be applied later in DOMContentLoaded with proper timing
+                const encodedCellStates = url.searchParams.get('cells');
+                console.log('Encoded cell states from URL:', encodedCellStates);
+            } catch (_) {}
+        }
+        
+        function applyToolsFromUrl(params) {
+            try {
+                // Check if tools widget should be shown
+                const showTools = params.get('tools');
+                if (showTools === '1') {
+                    // Mark that tool was loaded from URL
+                    _urlLoadedTool = true;
+                    
+                    // Apply color
+                    const color = params.get('color');
+                    if (color && /^[0-9a-fA-F]{6}$/.test(color)) {
+                        setStoredArrowColor('#' + color);
+                    }
+                    
+                    // Apply thickness
+                    const thickness = params.get('thickness');
+                    if (thickness) {
+                        const value = parseInt(thickness, 10);
+                        if (value >= 1 && value <= 10) {
+                            setStoredLineThickness(value);
+                        }
+                    }
+                    
+                    // Don't override fadeout time for URL-loaded tools - let individual shapes decide
+                    
+                    // Load shapes from URL
+                    const encodedShapes = params.get('shapes');
+                    if (encodedShapes) {
+                        const decodedShapes = decodeShapesFromUrl(encodedShapes);
+                        if (decodedShapes.length > 0) {
+                            _shapes = decodedShapes;
+                            saveShapes();
+                            // Trigger render after overlay is initialized
+                            setTimeout(() => {
+                                renderOverlay();
+                            }, 300);
+                        }
+                    }
+                    
+                    // Wait for widgets to be initialized before showing tools
+                    setTimeout(() => {
+                        const toolsWidget = document.querySelector('.tools-widget');
+                        const checkbox = document.getElementById('checkbox-tools');
+                        if (toolsWidget && checkbox) {
+                            toolsWidget.style.display = 'block';
+                            checkbox.textContent = '☑';
+                            localStorage.setItem('uvnote-widget-tools', 'visible');
+                        }
+                        
+                        // Apply active tool
+                        const activeTool = params.get('tool');
+                        if (activeTool && ['arrow', 'pen', 'eraser', 'spotlight'].includes(activeTool)) {
+                            const toolBtn = Array.from(document.querySelectorAll('.tool-button')).find(btn => btn.textContent === activeTool);
+                            if (toolBtn) {
+                                toolBtn.click();
+                            }
+                        }
+                        
+                        // Re-layout widgets after showing tools
+                        layoutWidgetsStackedBottomRight();
+                    }, 200);
+                }
+            } catch (_) {}
+        }
+        
+        function captureInitialCellStates() {
+            const cells = document.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    if (codeEl) {
+                        state.c = codeEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    if (outputEl) {
+                        state.o = outputEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    _initialCellStates[cellId] = state;
+                }
+            });
+            console.log('Captured initial cell states:', _initialCellStates);
+        }
+
+        // Build the value for the `cells` URL parameter.
+        // For every `.cell` element, compares the current collapsed state of its code
+        // and output panes with the snapshot in `_initialCellStates` and records only
+        // the panes that differ (`c` for code, `o` for output; 0 = collapsed,
+        // 1 = expanded). Panes with no recorded initial state are never included.
+        // Returns '' when nothing changed, otherwise base64 of the JSON-encoded map.
+        // Logs diagnostics to the console along the way.
+        function encodeCellStatesToUrl() {
+            // Get all cells and their collapse states
+            const cells = document.querySelectorAll('.cell');
+            const cellStates = {};
+            
+            console.log('Found cells:', cells.length);
+            
+            cells.forEach(cell => {
+                // Cell id is the element id with the first 'cell-' occurrence stripped
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                const initialState = _initialCellStates[cellId] || {};
+                
+                console.log(`Encoding cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    codeCollapsed: codeEl ? codeEl.classList.contains('collapsed') : 'N/A',
+                    outputCollapsed: outputEl ? outputEl.classList.contains('collapsed') : 'N/A',
+                    initialState: initialState
+                });
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    let hasChanges = false;
+                    
+                    if (codeEl) {
+                        const currentCodeState = codeEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialCodeState = initialState.c;
+                        // Only encode if different from initial state
+                        if (initialCodeState !== undefined && currentCodeState !== initialCodeState) {
+                            state.c = currentCodeState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    if (outputEl) {
+                        const currentOutputState = outputEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialOutputState = initialState.o;
+                        // Only encode if different from initial state
+                        if (initialOutputState !== undefined && currentOutputState !== initialOutputState) {
+                            state.o = currentOutputState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    // Only include cell if it has changes from initial state
+                    if (hasChanges) {
+                        cellStates[cellId] = state;
+                        console.log(`Added cell ${cellId}:`, state);
+                    }
+                }
+            });
+            
+            console.log('Final cell states to encode:', cellStates);
+            
+            // Return empty string if no changed cells
+            if (Object.keys(cellStates).length === 0) return '';
+            
+            // Encode as compact base64 string
+            const encoded = btoa(JSON.stringify(cellStates));
+            console.log('Encoded cell states:', encoded);
+            return encoded;
+        }
+        
+        function decodeCellStatesFromUrl(encodedStates) {
+            if (!encodedStates) return {};
+            
+            try {
+                return JSON.parse(atob(encodedStates));
+            } catch (e) {
+                console.error('Failed to decode cell states:', e);
+                return {};
+            }
+        }
+        
+        // Apply decoded cell states to the DOM.
+        //   cellStates - map of cell id -> { c?, o? }, where 0 means collapsed and any
+        //                other defined value means expanded
+        // For each cell it toggles the `collapsed` class on the code and output panes,
+        // calls updateIndicators, and reads offsetHeight to force a reflow. Failures in
+        // that last step are logged per cell. Logs diagnostics to the console.
+        function applyCellStatesFromUrl(cellStates) {
+            console.log('Applying cell states from URL:', cellStates);
+            Object.entries(cellStates).forEach(([cellId, state]) => {
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                console.log(`Cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    state: state
+                });
+                
+                if (codeEl && state.c !== undefined) {
+                    if (state.c === 0) {
+                        codeEl.classList.add('collapsed');
+                        console.log(`Collapsed code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    } else {
+                        codeEl.classList.remove('collapsed');
+                        codeEl.classList.add('expanded'); // Explicitly add expanded class
+                        console.log(`Expanded code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            hasExpandedClass: codeEl.classList.contains('expanded'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    }
+                }
+                
+                // NOTE(review): unlike the code pane, the output pane never gets an
+                // `expanded` class here - confirm whether that asymmetry is intended.
+                if (outputEl && state.o !== undefined) {
+                    if (state.o === 0) {
+                        outputEl.classList.add('collapsed');
+                        console.log(`Collapsed output for cell ${cellId}`);
+                    } else {
+                        outputEl.classList.remove('collapsed');
+                        console.log(`Expanded output for cell ${cellId}`);
+                    }
+                }
+                
+                // Update visual indicators and force style recalculation
+                try {
+                    updateIndicators(cellId);
+                    // Force browser to recalculate styles
+                    if (codeEl) {
+                        codeEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - code visible: ${getComputedStyle(codeEl).display !== 'none'}`);
+                    }
+                    if (outputEl) {
+                        outputEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - output visible: ${getComputedStyle(outputEl).display !== 'none'}`);
+                    }
+                } catch (e) {
+                    console.error(`Error updating indicators for cell ${cellId}:`, e);
+                }
+            });
+        }
+
+        function encodeShapesToUrl() {
+            // Encode shapes as compact base64 string
+            if (_shapes.length === 0) return '';
+            
+            const shapeData = _shapes.map(shape => {
+                const baseData = {
+                    ct: shape.createdAt, // creation timestamp
+                    fo: shape.fadeoutTime || getFadeoutTime() // fadeout time for this shape
+                };
+                
+                if (shape.type === 'arrow') {
+                    return {
+                        ...baseData,
+                        t: 'a',
+                        x1: Math.round(shape.x1),
+                        y1: Math.round(shape.y1),
+                        x2: Math.round(shape.x2),
+                        y2: Math.round(shape.y2),
+                        c: shape.color.substring(1), // remove #
+                        w: shape.width
+                    };
+                } else if (shape.type === 'pen') {
+                    return {
+                        ...baseData,
+                        t: 'p',
+                        pts: shape.points.map(p => [Math.round(p.x), Math.round(p.y)]),
+                        c: shape.color.substring(1),
+                        w: shape.width
+                    };
+                } else if (shape.type === 'spotlight') {
+                    return {
+                        ...baseData,
+                        t: 's',
+                        x: Math.round(shape.x),
+                        y: Math.round(shape.y),
+                        r: Math.round(shape.radius)
+                    };
+                }
+            }).filter(Boolean);
+            
+            return btoa(JSON.stringify(shapeData));
+        }
+        
+        function decodeShapesFromUrl(encodedShapes) {
+            if (!encodedShapes) return [];
+            
+            try {
+                const shapeData = JSON.parse(atob(encodedShapes));
+                return shapeData.map(data => {
+                    const base = {
+                        createdAt: data.ct || Date.now(), // use encoded timestamp or current time
+                        fadeoutTime: data.fo || 0, // use encoded fadeout time or 0 (never fade)
+                        opacity: 1.0
+                    };
+                    
+                    if (data.t === 'a') {
+                        return {
+                            ...base,
+                            type: 'arrow',
+                            x1: data.x1,
+                            y1: data.y1,
+                            x2: data.x2,
+                            y2: data.y2,
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 'p') {
+                        return {
+                            ...base,
+                            type: 'pen',
+                            points: data.pts.map(([x, y]) => ({ x, y })),
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 's') {
+                        return {
+                            ...base,
+                            type: 'spotlight',
+                            x: data.x,
+                            y: data.y,
+                            radius: data.r,
+                            color: '#000000'
+                        };
+                    }
+                }).filter(Boolean);
+            } catch (e) {
+                console.error('Failed to decode shapes:', e);
+                return [];
+            }
+        }
+        
+        // Rewrite the page URL's query string to reflect the current drawing state and
+        // return the resulting `window.location.href`.
+        // While `_isInitializing` is set, nothing is changed and the current href is
+        // returned. Otherwise, starting from the existing query string:
+        //   - `shapes` is set from encodeShapesToUrl(), or removed when that is empty;
+        //   - `cells` is kept if already present, else set from encodeCellStatesToUrl();
+        //   - `tools`, `tool`, `color`, `thickness` are set when the tools widget is
+        //     displayed and a tool other than 'none' is active, else removed together
+        //     with `fadeout`.
+        // The URL is updated with history.replaceState, so the page does not reload.
+        function encodeToolStateToUrl() {
+            // Don't update URL during initialization
+            if (_isInitializing) {
+                return window.location.href;
+            }
+            
+            const params = new URLSearchParams(window.location.search);
+            
+            // Check if tools widget is visible and has an active tool
+            const toolsWidget = document.querySelector('.tools-widget');
+            const activeTool = document.body.dataset.tool;
+            const hasActiveTool = activeTool && activeTool !== 'none';
+            const toolsWidgetVisible = toolsWidget && getComputedStyle(toolsWidget).display !== 'none';
+            
+            // Always handle shapes regardless of tool state
+            const encodedShapes = encodeShapesToUrl();
+            if (encodedShapes) {
+                params.set('shapes', encodedShapes);
+            } else {
+                params.delete('shapes');
+            }
+            
+            // Always preserve existing cell states from URL if present
+            // NOTE(review): because an existing `cells` value always wins, later
+            // collapse/expand changes are not written by this function once the
+            // parameter is present - confirm this is intended.
+            const existingCellStates = params.get('cells');
+            if (existingCellStates) {
+                // Keep existing cell states - don't re-encode from DOM
+                params.set('cells', existingCellStates);
+            } else {
+                // Only encode new cell states if none exist in URL
+                const encodedCellStates = encodeCellStatesToUrl();
+                if (encodedCellStates) {
+                    params.set('cells', encodedCellStates);
+                }
+            }
+            
+            if (toolsWidgetVisible && hasActiveTool) {
+                // Include tool params when widget is visible AND tool is active
+                params.set('tools', '1');
+                params.set('tool', activeTool);
+                
+                // Get color (without # prefix)
+                const color = getArrowColor();
+                if (color && color.startsWith('#')) {
+                    params.set('color', color.substring(1));
+                }
+                
+                // Get thickness
+                const thickness = getLineThickness();
+                params.set('thickness', thickness.toString());
+            } else {
+                // Remove tool state params but keep shapes
+                params.delete('tools');
+                params.delete('tool');
+                params.delete('color');
+                params.delete('thickness');
+                params.delete('fadeout');
+            }
+            
+            // Update URL without reloading
+            const newUrl = window.location.pathname + (params.toString() ? '?' + params.toString() : '') + window.location.hash;
+            window.history.replaceState(null, '', newUrl);
+            
+            return window.location.href;
+        }
+
+        // Reset the page to a clean state: remove every `uvnote-`-prefixed localStorage
+        // key, clear the line selection, deactivate the active tool, drop all shapes,
+        // expand every cell, then navigate to the current path + hash without any query
+        // parameters. Each step is wrapped in its own try/catch so one failure does not
+        // stop the rest.
+        function resetLayout() {
+            try {
+                // Clear all uvnote-* keys
+                const allKeys = Object.keys(localStorage);
+                const uvnoteKeys = allKeys.filter(key => key.startsWith('uvnote-'));
+                uvnoteKeys.forEach(k => localStorage.removeItem(k));
+            } catch (_) {}
+            
+            // Clear any active selection and remove URL params
+            try { clearSelection(true); } catch(_) {}
+            // Reset active tool if any
+            try { window.setActiveTool('none'); } catch(_) {}
+            // Clear shapes
+            // NOTE(review): saveShapes() runs after the storage wipe above and
+            // presumably writes a storage key again - confirm that is acceptable.
+            try { _shapes = []; saveShapes(); } catch(_) {}
+            // Reset URL-loaded tool flag
+            try { _urlLoadedTool = false; } catch(_) {}
+            // Reset all cells to expanded state
+            try {
+                const cells = document.querySelectorAll('.cell');
+                cells.forEach(cell => {
+                    const cellId = cell.id.replace('cell-', '');
+                    const codeEl = document.getElementById('code-' + cellId);
+                    const outputEl = document.getElementById('output-' + cellId);
+                    if (codeEl) codeEl.classList.remove('collapsed');
+                    if (outputEl) outputEl.classList.remove('collapsed');
+                    updateIndicators(cellId);
+                });
+            } catch(_) {}
+            
+            // Clear ALL URL parameters and reload with clean URL
+            // NOTE(review): if the URL has a hash and no query string left, assigning
+            // the same path+hash may be treated as a fragment navigation and not
+            // reload the page - confirm.
+            try {
+                const cleanUrl = window.location.pathname + window.location.hash;
+                window.location.href = cleanUrl; // Use window.location.href instead of history.replaceState + reload
+            } catch (_) {
+                // Fallback - reload current page
+                location.reload();
+            }
+        }
+
+        function toggleMenu() {
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.toggle('active');
+            }
+        }
+
+        function toggleWidget(widgetName) {
+            let widget;
+            let checkbox;
+            
+            // Close the menu first
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.remove('active');
+            }
+            
+            switch(widgetName) {
+                case 'tools':
+                    widget = document.querySelector('.tools-widget');
+                    checkbox = document.getElementById('checkbox-tools');
+                    break;
+                case 'file-explorer':
+                    widget = document.querySelector('.file-explorer');
+                    checkbox = document.getElementById('checkbox-file-explorer');
+                    break;
+                case 'minimap':
+                    widget = document.querySelector('.minimap');
+                    checkbox = document.getElementById('checkbox-minimap');
+                    break;
+                case 'status':
+                    widget = document.querySelector('.status-widget');
+                    checkbox = document.getElementById('checkbox-status');
+                    break;
+                default:
+                    return;
+            }
+            
+            if (widget && checkbox) {
+                const isVisible = getComputedStyle(widget).display !== 'none';
+                widget.style.display = isVisible ? 'none' : 'block';
+                checkbox.textContent = isVisible ? '☐' : '☑';
+                
+                // Save state to localStorage
+                try {
+                    localStorage.setItem(`uvnote-widget-${widgetName}`, isVisible ? 'hidden' : 'visible');
+                } catch (_) {}
+                
+                // Re-layout widgets after visibility change
+                try { 
+                    layoutWidgetsStackedBottomRight(); 
+                } catch (_) {}
+                
+                // Update URL when tools widget visibility changes
+                if (widgetName === 'tools') {
+                    encodeToolStateToUrl();
+                }
+            }
+        }
+        
+        function initializeWidgetVisibility() {
+            const widgets = [
+                { name: 'tools', selector: '.tools-widget' },
+                { name: 'file-explorer', selector: '.file-explorer' },
+                { name: 'minimap', selector: '.minimap' },
+                { name: 'status', selector: '.status-widget' }
+            ];
+            
+            widgets.forEach(({ name, selector }) => {
+                const defaultState = name === 'status' ? 'visible' : 'hidden';
+                const savedState = localStorage.getItem(`uvnote-widget-${name}`) || defaultState;
+                const widget = document.querySelector(selector);
+                const checkbox = document.getElementById(`checkbox-${name}`);
+                
+                if (widget && checkbox) {
+                    const isVisible = savedState === 'visible';
+                    widget.style.display = isVisible ? 'block' : 'none';
+                    checkbox.textContent = isVisible ? '☑' : '☐';
+                }
+            });
+        }
+
+        // Close menu when clicking outside
+        document.addEventListener('click', function(event) {
+            const menuButton = document.querySelector('.menu-button');
+            // Don't close if clicking on a menu item (let the item handler close it)
+            if (menuButton && !menuButton.contains(event.target)) {
+                menuButton.classList.remove('active');
+            }
+        });
+
+        // Layout: stack widgets bottom-right and equalize widths
+        function hasCustomWidgetPositions() {
+            try {
+                return (
+                    localStorage.getItem('uvnote-minimap-pos') ||
+                    localStorage.getItem('uvnote-file-explorer-pos') ||
+                    localStorage.getItem('uvnote-tools-pos')
+                );
+            } catch (_) { return false; }
+        }
+
+        function rectsOverlap(r1, r2) {
+            return !(r1.right <= r2.left || r2.right <= r1.left || r1.bottom <= r2.top || r2.bottom <= r1.top);
+        }
+
+        function widgetsOverlap(widgets) {
+            for (let i = 0; i < widgets.length; i++) {
+                const a = widgets[i];
+                const ra = a.getBoundingClientRect();
+                for (let j = i + 1; j < widgets.length; j++) {
+                    const b = widgets[j];
+                    const rb = b.getBoundingClientRect();
+                    if (rectsOverlap(ra, rb)) return true;
+                }
+            }
+            return false;
+        }
+
+        function applyStackLayout(widgets, order) {
+            if (!widgets.length) return;
+            // Fixed equal width
+            const fixedWidth = 220;
+            widgets.forEach(el => { el.style.width = fixedWidth + 'px'; });
+
+            // Fit heights if needed to avoid overflow
+            const gap = 12;
+            const available = Math.max(0, window.innerHeight - 40 - gap * (order.length - 1));
+            const eachMax = Math.floor(available / order.length);
+            order.forEach(el => {
+                el.style.maxHeight = eachMax + 'px';
+                el.style.overflowY = 'auto';
+            });
+
+            // Stack bottom-up in the requested order
+            let bottomOffset = 20; // base gutter
+            order.forEach(el => {
+                el.style.left = 'auto';
+                el.style.top = 'auto';
+                el.style.right = '20px';
+                el.style.bottom = bottomOffset + 'px';
+                bottomOffset += el.offsetHeight + gap;
+            });
+        }
+
+        function layoutWidgetsStackedBottomRight() {
+            const minimap = document.querySelector('.minimap');
+            const fileExplorer = document.querySelector('.file-explorer');
+            const tools = document.querySelector('.tools-widget');
+            const status = document.querySelector('.status-widget');
+            const widgets = [minimap, fileExplorer, tools, status].filter(el => el && getComputedStyle(el).display !== 'none');
+            if (!widgets.length) return;
+
+            const order = [minimap, fileExplorer, tools, status].filter(Boolean).filter(el => getComputedStyle(el).display !== 'none');
+
+            // If user placed custom positions and there is no overlap, respect them.
+            if (hasCustomWidgetPositions() && !widgetsOverlap(widgets)) return;
+
+            applyStackLayout(widgets, order);
+        }
+        
+        // Panel icon removed
+        
+        let _minimapScrollContainer = null;
+        let _minimapScrollHandler = null;
+        function initMinimap() {
+            // Generate minimap content
+            const minimap = createMinimap();
+            document.body.appendChild(minimap);
+            // Make draggable (use title as handle)
+            const mTitle = minimap.querySelector('.minimap-title');
+            makeDraggable(minimap, 'uvnote-minimap-pos', mTitle);
+
+            // Attach scroll listener to window (two-panel removed)
+            _minimapScrollContainer = window;
+
+            if (_minimapScrollContainer) {
+                _minimapScrollHandler = () => updateMinimapActive();
+                if (_minimapScrollContainer === window) {
+                    window.addEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.addEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            updateMinimapActive();
+        }
+
+        function teardownMinimap() {
+            const minimap = document.querySelector('.minimap');
+            if (minimap && minimap.parentNode) minimap.parentNode.removeChild(minimap);
+            if (_minimapScrollContainer && _minimapScrollHandler) {
+                if (_minimapScrollContainer === window) {
+                    window.removeEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.removeEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            _minimapScrollContainer = null;
+            _minimapScrollHandler = null;
+        }
+        
+        function initFileExplorer() {
+            // Generate file explorer content
+            const fileExplorer = createFileExplorer();
+            document.body.appendChild(fileExplorer);
+        }
+        
+        function createMinimap() {
+            const minimap = document.createElement('div');
+            minimap.className = 'minimap';
+            
+            const title = document.createElement('div');
+            title.className = 'minimap-title';
+            title.textContent = 'navigation';
+            minimap.appendChild(title);
+            
+            // Find all headings and cells
+            const root = document.querySelector('.main-content') || document;
+            const headings = root.querySelectorAll('h1, h2, h3, h4, h5, h6');
+            const cells = root.querySelectorAll('.cell');
+            
+            // Combine and sort by position
+            const items = [];
+            
+            headings.forEach(heading => {
+                const id = heading.id || generateId(heading.textContent);
+                if (!heading.id) heading.id = id;
+                
+                items.push({
+                    element: heading,
+                    type: 'heading',
+                    level: parseInt(heading.tagName.charAt(1)),
+                    text: heading.textContent.trim(),
+                    id: id,
+                    position: heading.getBoundingClientRect().top + window.scrollY
+                });
+            });
+            
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const id = cell.id || `cell-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+                    if (!cell.id) cell.id = id;
+                    
+                    items.push({
+                        element: cell,
+                        type: 'cell',
+                        text: header.textContent.trim(),
+                        id: id,
+                        position: cell.getBoundingClientRect().top + window.scrollY
+                    });
+                }
+            });
+            
+            // Sort by position
+            items.sort((a, b) => a.position - b.position);
+            
+            // Create minimap items
+            items.forEach(item => {
+                const link = document.createElement('a');
+                link.className = `minimap-item ${item.type === 'heading' ? 'minimap-heading' : 'minimap-cell'}`;
+                if (item.type === 'heading') {
+                    link.classList.add(`h${item.level}`);
+                }
+                link.textContent = item.text.length > 25 ? item.text.substring(0, 22) + '...' : item.text;
+                link.href = `#${item.id}`;
+                link.onclick = function(e) {
+                    e.preventDefault();
+                    item.element.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                };
+                minimap.appendChild(link);
+            });
+            
+            return minimap;
+        }
+        
+        function generateId(text) {
+            return text.toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '')
+                .substring(0, 20);
+        }
+        
+        function updateMinimapActive() {
+            const minimapItems = document.querySelectorAll('.minimap-item');
+            const container = _minimapScrollContainer || window;
+            const containerRect = container === window ? null : container.getBoundingClientRect();
+            const scrollPos = (container === window ? window.scrollY : container.scrollTop) + 100; // Offset for better detection
+            
+            let activeItem = null;
+            minimapItems.forEach(item => {
+                const targetId = item.getAttribute('href').substring(1);
+                const target = document.getElementById(targetId);
+                
+                if (target) {
+                    const rectTop = target.getBoundingClientRect().top;
+                    const targetPos = (container === window)
+                        ? rectTop + window.scrollY
+                        : rectTop - containerRect.top + container.scrollTop;
+                    if (targetPos <= scrollPos) {
+                        activeItem = item;
+                    }
+                }
+                
+                item.classList.remove('active');
+            });
+            
+            if (activeItem) {
+                activeItem.classList.add('active');
+            }
+        }
+        
+        function createFileExplorer() {
+            const fileExplorer = document.createElement('div');
+            fileExplorer.className = 'file-explorer';
+            
+            const title = document.createElement('div');
+            title.className = 'file-explorer-title';
+            title.textContent = 'files';
+            fileExplorer.appendChild(title);
+            // Make draggable (use title as handle)
+            makeDraggable(fileExplorer, 'uvnote-file-explorer-pos', title);
+            
+            // Scripts section
+            const scriptsSection = document.createElement('div');
+            scriptsSection.className = 'file-explorer-section';
+            
+            const scriptsTitle = document.createElement('div');
+            scriptsTitle.className = 'file-explorer-section-title';
+            scriptsTitle.textContent = 'scripts';
+            scriptsSection.appendChild(scriptsTitle);
+            
+            // Find all cells and list their script files (single panel)
+            const root = document.querySelector('.main-content') || document;
+            const cells = root.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const cellText = header.textContent.trim();
+                    const cellMatch = cellText.match(/Cell: ([a-zA-Z_][a-zA-Z0-9_]*)/);
+                    if (cellMatch) {
+                        const cellId = cellMatch[1];
+                        const scriptItem = document.createElement('div');
+                        scriptItem.className = 'file-explorer-item script';
+                        scriptItem.textContent = `${cellId}.py`;
+                        scriptItem.onclick = function() {
+                            cell.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        };
+                        scriptsSection.appendChild(scriptItem);
+                    }
+                }
+            });
+            
+            fileExplorer.appendChild(scriptsSection);
+            
+            // Artifacts section
+            const artifactsSection = document.createElement('div');
+            artifactsSection.className = 'file-explorer-section';
+            
+            const artifactsTitle = document.createElement('div');
+            artifactsTitle.className = 'file-explorer-section-title';
+            artifactsTitle.textContent = 'artifacts';
+            artifactsSection.appendChild(artifactsTitle);
+            
+            // Find all artifact links (single panel)
+            const artifactsRoot = document.querySelector('.main-content') || document;
+            const artifacts = artifactsRoot.querySelectorAll('.artifact');
+            if (artifacts.length === 0) {
+                const noArtifacts = document.createElement('div');
+                noArtifacts.className = 'file-explorer-item artifact';
+                noArtifacts.textContent = '(none)';
+                noArtifacts.style.opacity = '0.5';
+                artifactsSection.appendChild(noArtifacts);
+            } else {
+                artifacts.forEach(artifact => {
+                    const artifactItem = document.createElement('div');
+                    artifactItem.className = 'file-explorer-item artifact';
+                    artifactItem.textContent = artifact.textContent;
+                    artifactItem.onclick = function() {
+                        artifact.click();
+                    };
+                    artifactsSection.appendChild(artifactItem);
+                });
+            }
+            
+            fileExplorer.appendChild(artifactsSection);
+            
+            return fileExplorer;
+        }
+
+        function initStatusWidget() {
+            let el = document.querySelector('.status-widget');
+            if (!el) {
+                el = document.createElement('div');
+                el.className = 'status-widget';
+                el.id = 'status-widget';
+                el.textContent = 'ready — Esc';
+                document.body.appendChild(el);
+            }
+        }
+
+        // Tools widget
+        let _cursorX = 0;
+        let _cursorY = 0;
+        let _cursorVisible = false;
+        
+        function setActiveTool(tool) {
+            if (!tool || tool === 'none') {
+                document.body.dataset.tool = 'none';
+                localStorage.setItem('uvnote-active-tool', 'none');
+                setOverlayActive(false);
+                _cursorVisible = false;
+                // Remove active class from all tool buttons when deactivating
+                const toolButtons = document.querySelectorAll('.tools-widget .tool-button');
+                toolButtons.forEach(btn => btn.classList.remove('active'));
+                updateStateIndicator();
+                encodeToolStateToUrl();
+                return;
+            }
+            document.body.dataset.tool = tool;
+            localStorage.setItem('uvnote-active-tool', tool);
+            setOverlayActive(true);
+            _cursorVisible = true;
+            updateStateIndicator();
+            encodeToolStateToUrl();
+        }
+
+        // Make setActiveTool globally accessible for ESC key handler
+        window.setActiveTool = setActiveTool;
+        
+
+        function getArrowColor() {
+            const saved = localStorage.getItem('uvnote-arrow-color');
+            if (saved) return saved;
+            return '#e53935'; // Default red color
+        }
+
+        function setStoredArrowColor(color) {
+            try { localStorage.setItem('uvnote-arrow-color', color); } catch (_) {}
+        }
+
+        function getLineThickness() {
+            const saved = localStorage.getItem('uvnote-line-thickness');
+            if (saved) return parseInt(saved, 10);
+            return 6; // default thickness
+        }
+
+        function setStoredLineThickness(thickness) {
+            try { localStorage.setItem('uvnote-line-thickness', thickness); } catch (_) {}
+        }
+
+        function getFadeoutTime() {
+            const saved = localStorage.getItem('uvnote-fadeout-time');
+            if (saved) return parseInt(saved, 10);
+            return 5; // default 5 seconds
+        }
+
+        function setStoredFadeoutTime(seconds) {
+            try { localStorage.setItem('uvnote-fadeout-time', seconds); } catch (_) {}
+        }
+
+        function createToolsWidget() {
+            const tools = document.createElement('div');
+            tools.className = 'tools-widget';
+
+            const title = document.createElement('div');
+            title.className = 'tools-title';
+            title.textContent = 'tools';
+            tools.appendChild(title);
+
+            const row = document.createElement('div');
+            row.className = 'tools-row';
+            tools.appendChild(row);
+
+            // Arrow tool
+            const arrowBtn = document.createElement('div');
+            arrowBtn.className = 'tool-button';
+            arrowBtn.textContent = 'arrow';
+            arrowBtn.onclick = function() {
+                const isActive = arrowBtn.classList.contains('active');
+                if (isActive) {
+                    arrowBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    arrowBtn.classList.add('active');
+                    setActiveTool('arrow');
+                }
+            };
+            row.appendChild(arrowBtn);
+
+            // Pen tool
+            const penBtn = document.createElement('div');
+            penBtn.className = 'tool-button';
+            penBtn.textContent = 'pen';
+            penBtn.onclick = function() {
+                const isActive = penBtn.classList.contains('active');
+                if (isActive) {
+                    penBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    penBtn.classList.add('active');
+                    setActiveTool('pen');
+                }
+            };
+            row.appendChild(penBtn);
+
+            // Eraser tool
+            const eraseBtn = document.createElement('div');
+            eraseBtn.className = 'tool-button';
+            eraseBtn.textContent = 'eraser';
+            eraseBtn.onclick = function() {
+                const isActive = eraseBtn.classList.contains('active');
+                if (isActive) {
+                    eraseBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    eraseBtn.classList.add('active');
+                    setActiveTool('eraser');
+                }
+            };
+            row.appendChild(eraseBtn);
+
+            // Spotlight tool
+            const spotlightBtn = document.createElement('div');
+            spotlightBtn.className = 'tool-button';
+            spotlightBtn.textContent = 'spotlight';
+            spotlightBtn.onclick = function() {
+                const isActive = spotlightBtn.classList.contains('active');
+                if (isActive) {
+                    spotlightBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    spotlightBtn.classList.add('active');
+                    setActiveTool('spotlight');
+                }
+            };
+            row.appendChild(spotlightBtn);
+
+            // Clear all
+            const clearBtn = document.createElement('div');
+            clearBtn.className = 'tool-button';
+            clearBtn.textContent = 'clear';
+            clearBtn.onclick = function() {
+                _shapes = [];
+                saveShapes();
+                renderOverlay();
+            };
+            row.appendChild(clearBtn);
+            
+            // We'll add the copy button at the end of the widget
+
+            // Restore active state from storage
+            const saved = localStorage.getItem('uvnote-active-tool') || 'none';
+            if (saved === 'arrow') {
+                arrowBtn.classList.add('active');
+                setActiveTool('arrow');
+            } else if (saved === 'pen') {
+                penBtn.classList.add('active');
+                setActiveTool('pen');
+            } else if (saved === 'eraser') {
+                eraseBtn.classList.add('active');
+                setActiveTool('eraser');
+            } else if (saved === 'spotlight') {
+                spotlightBtn.classList.add('active');
+                setActiveTool('spotlight');
+            }
+
+            // Color selector
+            const colorTitle = document.createElement('div');
+            colorTitle.className = 'tools-section-title';
+            colorTitle.textContent = 'color';
+            tools.appendChild(colorTitle);
+
+            const colorRow = document.createElement('div');
+            colorRow.className = 'tools-row color-row';
+            tools.appendChild(colorRow);
+
+            const swatchColors = [
+                // Primary colors
+                '#e53935', '#fb8c00', '#fdd835', '#43a047', '#1e88e5', '#8e24aa',
+                // Additional useful colors  
+                '#ff5722', '#795548', '#607d8b', '#9c27b0',
+                // Grayscale
+                '#000000', '#424242', '#9e9e9e', '#ffffff'
+            ];
+            const swatches = [];
+            swatchColors.forEach(c => {
+                const s = document.createElement('div');
+                s.className = 'color-swatch';
+                s.style.backgroundColor = c;
+                s.title = c;
+                s.onclick = () => {
+                    setStoredArrowColor(c);
+                    refreshColorUI(c);
+                    if (_cursorVisible) renderOverlay();
+                    encodeToolStateToUrl();
+                };
+                colorRow.appendChild(s);
+                swatches.push(s);
+            });
+
+            const colorInput = document.createElement('input');
+            colorInput.type = 'color';
+            colorInput.className = 'color-input';
+            colorInput.oninput = () => {
+                setStoredArrowColor(colorInput.value);
+                refreshColorUI(colorInput.value);
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+            colorRow.appendChild(colorInput);
+
+            function refreshColorUI(selected) {
+                const selectedHex = selected.startsWith('#') ? selected.toLowerCase() : rgbToHex(selected);
+                
+                swatches.forEach((s, i) => {
+                    const swatchHex = swatchColors[i].toLowerCase();
+                    if (swatchHex === selectedHex) {
+                        s.classList.add('selected');
+                    } else {
+                        s.classList.remove('selected');
+                    }
+                });
+                
+                try { 
+                    colorInput.value = selectedHex; 
+                } catch (_) {}
+            }
+
+            function rgbToHex(rgb) {
+                const m = rgb.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)\)/i);
+                if (!m) return '#000000';
+                const r = parseInt(m[1]).toString(16).padStart(2, '0');
+                const g = parseInt(m[2]).toString(16).padStart(2, '0');
+                const b = parseInt(m[3]).toString(16).padStart(2, '0');
+                return `#${r}${g}${b}`;
+            }
+
+            // Restore color selection
+            refreshColorUI(getArrowColor());
+
+            // Thickness slider
+            const thicknessTitle = document.createElement('div');
+            thicknessTitle.className = 'tools-section-title';
+            thicknessTitle.textContent = 'thickness';
+            tools.appendChild(thicknessTitle);
+
+            const thicknessRow = document.createElement('div');
+            thicknessRow.className = 'thickness-row';
+            tools.appendChild(thicknessRow);
+
+            const thicknessSlider = document.createElement('input');
+            thicknessSlider.type = 'range';
+            thicknessSlider.className = 'thickness-slider';
+            thicknessSlider.min = '1';
+            thicknessSlider.max = '10';
+            thicknessSlider.value = getLineThickness();
+            
+            const thicknessValue = document.createElement('span');
+            thicknessValue.className = 'thickness-value';
+            thicknessValue.textContent = thicknessSlider.value + 'px';
+
+            thicknessSlider.oninput = function() {
+                const value = parseInt(thicknessSlider.value, 10);
+                setStoredLineThickness(value);
+                thicknessValue.textContent = value + 'px';
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+
+            thicknessRow.appendChild(thicknessSlider);
+            thicknessRow.appendChild(thicknessValue);
+
+            // Fadeout time slider
+            const fadeoutTitle = document.createElement('div');
+            fadeoutTitle.className = 'tools-section-title';
+            fadeoutTitle.textContent = 'fadeout time';
+            tools.appendChild(fadeoutTitle);
+
+            const fadeoutRow = document.createElement('div');
+            fadeoutRow.className = 'thickness-row';
+            tools.appendChild(fadeoutRow);
+
+            const fadeoutSlider = document.createElement('input');
+            fadeoutSlider.type = 'range';
+            fadeoutSlider.className = 'thickness-slider';
+            fadeoutSlider.min = '0';
+            fadeoutSlider.max = '30';
+            fadeoutSlider.value = getFadeoutTime();
+            
+            const fadeoutValue = document.createElement('span');
+            fadeoutValue.className = 'thickness-value';
+            fadeoutValue.textContent = fadeoutSlider.value === '0' ? 'never' : fadeoutSlider.value + 's';
+
+            fadeoutSlider.oninput = function() {
+                const value = parseInt(fadeoutSlider.value, 10);
+                setStoredFadeoutTime(value);
+                fadeoutValue.textContent = value === 0 ? 'never' : value + 's';
+                encodeToolStateToUrl();
+            };
+
+            fadeoutRow.appendChild(fadeoutSlider);
+            fadeoutRow.appendChild(fadeoutValue);
+
+            // Draggable behavior
+            makeDraggable(tools, 'uvnote-tools-pos', title);
+
+            return tools;
+        }
+
+        function initTools() {
+            const widget = createToolsWidget();
+            document.body.appendChild(widget);
+        }
+
+        function teardownTools() {
+            const w = document.querySelector('.tools-widget');
+            if (w && w.parentNode) w.parentNode.removeChild(w);
+        }
+
+        // --- Canvas overlay for tools ---
+        // Module-level state shared by the overlay helpers below.
+        let _overlay = null; // overlay element positioned and sized by updateOverlayBounds()
+        let _overlayCtx = null; // presumably the overlay's drawing context — TODO confirm where it is set
+        let _overlayContainer = null; // scroll container; updateOverlayModeAndContainer() sets it to window
+        let _overlayMode = 'single'; // updateOverlayModeAndContainer() always sets 'single'
+        let _overlayResizeHandler = null;
+        let _overlayScrollHandler = null;
+        let _drawing = null; // current in-progress shape; its `type` ('pen', 'spotlight', otherwise arrow) selects the handler
+        let _shapes = []; // committed shapes for current mode (persisted by saveShapes / loadShapes)
+        let _fadeTimer = null; // timer for fade animation
+        let _urlLoadedTool = false; // track if tool was loaded from URL
+        let _isInitializing = true; // prevent URL updates during initialization
+        let _initialCellStates = {}; // track initial cell states from page load
+
+        function getOverlayStorageKey() { return 'uvnote-shapes'; }
+
+        function loadShapes() {
+            try {
+                const raw = localStorage.getItem(getOverlayStorageKey());
+                _shapes = raw ? JSON.parse(raw) : [];
+            } catch (_) { _shapes = []; }
+        }
+
+        function saveShapes() {
+            try { 
+                localStorage.setItem(getOverlayStorageKey(), JSON.stringify(_shapes));
+                // Always update URL when shapes change
+                encodeToolStateToUrl();
+            } catch (_) {}
+        }
+
+        function updateShapesFade() {
+            const now = Date.now();
+            let needsUpdate = false;
+
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const shape = _shapes[i];
+                if (!shape.createdAt) continue; // Skip old shapes without timestamps
+                
+                // Use individual shape's fadeout time, or global if not set
+                const shapesFadeoutSeconds = shape.fadeoutTime !== undefined ? shape.fadeoutTime : getFadeoutTime();
+                
+                // Skip fading if fadeout is disabled for this shape
+                if (shapesFadeoutSeconds === 0) continue;
+                
+                const fadeStartTime = Math.max(0, (shapesFadeoutSeconds - 2) * 1000); // Start fading 2s before end
+                const fadeEndTime = shapesFadeoutSeconds * 1000; // Fully gone after specified time
+                const age = now - shape.createdAt;
+                
+                if (age >= fadeEndTime) {
+                    // Remove completely faded shapes
+                    _shapes.splice(i, 1);
+                    needsUpdate = true;
+                } else if (age >= fadeStartTime) {
+                    // Update opacity for fading shapes
+                    const fadeProgress = (age - fadeStartTime) / (fadeEndTime - fadeStartTime);
+                    const newOpacity = 1 - fadeProgress;
+                    if (Math.abs(shape.opacity - newOpacity) > 0.01) {
+                        shape.opacity = newOpacity;
+                        needsUpdate = true;
+                    }
+                }
+            }
+
+            if (needsUpdate) {
+                saveShapes();
+                renderOverlay();
+                // Update URL to remove faded shapes  
+                encodeToolStateToUrl();
+            }
+        }
+
+        function getContentContainer() { return window; }
+
+        function updateOverlayModeAndContainer() {
+            _overlayContainer = window;
+            _overlayMode = 'single';
+        }
+
+        function updateOverlayBounds() {
+            if (!_overlay) return;
+            if (_overlayContainer === window) {
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = '0px';
+                _overlay.style.top = '0px';
+                _overlay.width = window.innerWidth;
+                _overlay.height = window.innerHeight;
+            } else {
+                const rect = _overlayContainer.getBoundingClientRect();
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = rect.left + 'px';
+                _overlay.style.top = rect.top + 'px';
+                _overlay.width = Math.max(0, Math.floor(rect.width));
+                _overlay.height = Math.max(0, Math.floor(rect.height));
+            }
+            renderOverlay();
+        }
+
+        function containerScrollLeft() {
+            return (_overlayContainer === window) ? (window.scrollX || 0) : (_overlayContainer.scrollLeft || 0);
+        }
+        function containerScrollTop() {
+            return (_overlayContainer === window) ? (window.scrollY || 0) : (_overlayContainer.scrollTop || 0);
+        }
+
+        function toCanvasCoords(clientX, clientY) {
+            const rect = _overlay.getBoundingClientRect();
+            return { x: clientX - rect.left, y: clientY - rect.top };
+        }
+
+        function onPointerDown(e) {
+            const tool = document.body.dataset.tool;
+            if (tool === 'arrow') {
+                startDrawArrow(e);
+            } else if (tool === 'pen') {
+                startDrawPen(e);
+            } else if (tool === 'eraser') {
+                eraseAt(e);
+            } else if (tool === 'spotlight') {
+                startDrawSpotlight(e);
+            }
+        }
+
+        function onPointerMove(e) {
+            // Update cursor position
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _cursorX = pt.x;
+            _cursorY = pt.y;
+            
+            if (!_drawing) {
+                // Just update cursor position and re-render
+                if (_cursorVisible) {
+                    renderOverlay();
+                }
+                return;
+            }
+            
+            if (_drawing.type === 'pen') {
+                moveDrawPen(e);
+            } else if (_drawing.type === 'spotlight') {
+                moveDrawSpotlight(e);
+            } else {
+                moveDrawArrow(e);
+            }
+        }
+        
+        function onPointerEnter(e) {
+            _cursorVisible = document.body.dataset.tool !== 'none';
+            if (_cursorVisible) {
+                renderOverlay();
+            }
+        }
+        
+        // mouseleave handler: hides the cursor indicator and repaints the overlay.
+        function onPointerLeave(e) {
+            // The flag must be cleared before rendering so the indicator is not redrawn.
+            _cursorVisible = false;
+            renderOverlay();
+        }
+
+        function onPointerUp(e) {
+            if (!_drawing) return;
+            if (_drawing.type === 'pen') {
+                endDrawPen();
+            } else if (_drawing.type === 'spotlight') {
+                endDrawSpotlight();
+            } else {
+                endDrawArrow();
+            }
+        }
+
+        function startDrawArrow(e) {
+            if (document.body.dataset.tool !== 'arrow') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                x1: pt.x + containerScrollLeft(),
+                y1: pt.y + containerScrollTop(),
+                x2: pt.x + containerScrollLeft(),
+                y2: pt.y + containerScrollTop(),
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawArrow(e) {
+            if (!_drawing) return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.x2 = pt.x + containerScrollLeft();
+            _drawing.y2 = pt.y + containerScrollTop();
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawArrow() {
+            if (!_drawing) return;
+            _shapes.push({ 
+                type: 'arrow', 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawPen(e) {
+            if (document.body.dataset.tool !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'pen',
+                points: [{
+                    x: pt.x + containerScrollLeft(),
+                    y: pt.y + containerScrollTop()
+                }],
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawPen(e) {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.points.push({
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop()
+            });
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawPen() {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            if (_drawing.points.length > 1) {
+                _shapes.push({ 
+                    ..._drawing,
+                    createdAt: Date.now(),
+                    fadeoutTime: getFadeoutTime(),
+                    opacity: 1.0
+                });
+            }
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawSpotlight(e) {
+            if (document.body.dataset.tool !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'spotlight',
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop(),
+                radius: getLineThickness() * 20, // Use thickness to control spotlight size (bigger default)
+                color: getArrowColor()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawSpotlight(e) {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const dx = pt.x + containerScrollLeft() - _drawing.x;
+            const dy = pt.y + containerScrollTop() - _drawing.y;
+            _drawing.radius = Math.max(20, Math.sqrt(dx * dx + dy * dy)); // Minimum radius of 20
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawSpotlight() {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            _shapes.push({ 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function distPointToSegment(px, py, x1, y1, x2, y2) {
+            const dx = x2 - x1, dy = y2 - y1;
+            if (dx === 0 && dy === 0) return Math.hypot(px - x1, py - y1);
+            const t = Math.max(0, Math.min(1, ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)));
+            const cx = x1 + t * dx, cy = y1 + t * dy;
+            return Math.hypot(px - cx, py - cy);
+        }
+
+        function eraseAt(e) {
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const x = pt.x + containerScrollLeft();
+            const y = pt.y + containerScrollTop();
+            const threshold = 10; // pixels
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const s = _shapes[i];
+                if (s.type === 'arrow') {
+                    const d = distPointToSegment(x, y, s.x1, s.y1, s.x2, s.y2);
+                    if (d <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                } else if (s.type === 'pen' && s.points) {
+                    // Check if click is near any line segment in the pen stroke
+                    let minDist = Infinity;
+                    for (let j = 1; j < s.points.length; j++) {
+                        const d = distPointToSegment(x, y, s.points[j-1].x, s.points[j-1].y, s.points[j].x, s.points[j].y);
+                        minDist = Math.min(minDist, d);
+                    }
+                    if (minDist <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                }
+            }
+            e.preventDefault();
+        }
+
+        function drawArrow(ctx, x1, y1, x2, y2, color, width, opacity = 1.0) {
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.fillStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            
+            // Check if points are too close (initial state)
+            const dx = x2 - x1;
+            const dy = y2 - y1;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            
+            if (distance < 5) {
+                // Draw just a small arrowhead pointing down-right when first clicked
+                const defaultAngle = Math.PI / 4; // 45 degrees (down-right)
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate arrowhead points
+                const hx1 = x1 + headLength * Math.cos(defaultAngle - headAngle);
+                const hy1 = y1 + headLength * Math.sin(defaultAngle - headAngle);
+                const hx2 = x1 + headLength * Math.cos(defaultAngle + headAngle);
+                const hy2 = y1 + headLength * Math.sin(defaultAngle + headAngle);
+                
+                // Draw arrowhead only
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            } else {
+                // Normal arrow drawing - head at x1,y1, tail at x2,y2
+                const angle = Math.atan2(y1 - y2, x1 - x2);
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate where the line should end (before the arrowhead)
+                const lineEndX = x1 - headLength * 0.8 * Math.cos(angle);
+                const lineEndY = y1 - headLength * 0.8 * Math.sin(angle);
+                
+                // Draw the line from tail to near the head
+                ctx.beginPath();
+                ctx.moveTo(x2, y2);
+                ctx.lineTo(lineEndX, lineEndY);
+                ctx.stroke();
+                
+                // Calculate arrowhead points
+                const hx1 = x1 - headLength * Math.cos(angle - headAngle);
+                const hy1 = y1 - headLength * Math.sin(angle - headAngle);
+                const hx2 = x1 - headLength * Math.cos(angle + headAngle);
+                const hy2 = y1 - headLength * Math.sin(angle + headAngle);
+                
+                // Draw arrowhead
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            }
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawPen(ctx, points, color, width, offX, offY, opacity = 1.0) {
+            if (!points || points.length < 2) return;
+            
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            ctx.beginPath();
+            ctx.moveTo(points[0].x - offX, points[0].y - offY);
+            for (let i = 1; i < points.length; i++) {
+                ctx.lineTo(points[i].x - offX, points[i].y - offY);
+            }
+            ctx.stroke();
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawAllSpotlights(ctx, spotlights, offX, offY) {
+            if (!spotlights || spotlights.length === 0) return;
+            
+            ctx.save();
+            
+            // Calculate the overall opacity based on all spotlights
+            const maxOpacity = Math.max(...spotlights.map(s => s.opacity || 1.0));
+            
+            // Fill entire canvas with dark overlay
+            ctx.fillStyle = `rgba(0, 0, 0, ${0.7 * maxOpacity})`;
+            ctx.fillRect(0, 0, ctx.canvas.width, ctx.canvas.height);
+            
+            // Cut out completely transparent holes for all spotlights
+            ctx.globalCompositeOperation = 'destination-out';
+            ctx.fillStyle = 'rgba(0, 0, 0, 1)'; // Solid black to ensure complete removal
+            for (const spotlight of spotlights) {
+                ctx.beginPath();
+                ctx.arc(spotlight.x - offX, spotlight.y - offY, spotlight.radius, 0, 2 * Math.PI);
+                ctx.fill();
+            }
+            
+            ctx.restore();
+        }
+
+        // Repaints the whole overlay canvas from scratch. In order:
+        //   1. clears the canvas,
+        //   2. draws stored arrows and pen strokes,
+        //   3. draws the arrow/pen stroke currently being drawn,
+        //   4. draws one combined spotlight layer (stored spotlights, the spotlight
+        //      being drawn, and a cursor preview when the spotlight tool is active),
+        //   5. draws a cursor indicator for the non-spotlight tools.
+        // Stored coordinates include the scroll offset, so the current scroll offset
+        // is subtracted when drawing. Does nothing if the overlay or its context is missing.
+        function renderOverlay() {
+            if (!_overlay || !_overlayCtx) return;
+            _overlayCtx.clearRect(0, 0, _overlay.width, _overlay.height);
+            const offX = containerScrollLeft();
+            const offY = containerScrollTop();
+            // Draw non-spotlight shapes first
+            for (const s of _shapes) {
+                // A shape without an opacity field is drawn fully opaque
+                const opacity = s.opacity !== undefined ? s.opacity : 1.0;
+                if (s.type === 'arrow') {
+                    drawArrow(_overlayCtx, s.x1 - offX, s.y1 - offY, s.x2 - offX, s.y2 - offY, s.color || '#f00', s.width || 2, opacity);
+                } else if (s.type === 'pen') {
+                    drawPen(_overlayCtx, s.points, s.color || '#f00', s.width || 2, offX, offY, opacity);
+                }
+            }
+            // Draw current drawing (non-spotlight)
+            if (_drawing) {
+                if (_drawing.type === 'pen') {
+                    drawPen(_overlayCtx, _drawing.points, _drawing.color, _drawing.width, offX, offY);
+                } else if (_drawing.type !== 'spotlight') {
+                    // Anything that is neither pen nor spotlight is drawn as an arrow
+                    drawArrow(_overlayCtx, _drawing.x1 - offX, _drawing.y1 - offY, _drawing.x2 - offX, _drawing.y2 - offY, _drawing.color, _drawing.width);
+                }
+            }
+            
+            // Collect all spotlights (existing + current drawing + cursor preview)
+            const spotlights = [];
+            
+            // Add existing spotlight shapes
+            for (const s of _shapes) {
+                if (s.type === 'spotlight') {
+                    spotlights.push({
+                        x: s.x,
+                        y: s.y, 
+                        radius: s.radius,
+                        opacity: s.opacity !== undefined ? s.opacity : 1.0
+                    });
+                }
+            }
+            
+            // Add current spotlight being drawn
+            if (_drawing && _drawing.type === 'spotlight') {
+                spotlights.push({
+                    x: _drawing.x,
+                    y: _drawing.y,
+                    radius: _drawing.radius,
+                    opacity: 1.0
+                });
+            }
+            
+            // Add cursor preview spotlight if tool is active
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                if (tool === 'spotlight') {
+                    const thickness = getLineThickness();
+                    const radius = thickness * 20;
+                    // _cursorX/_cursorY are canvas-relative; add the scroll offset so the
+                    // preview uses the same coordinate space as stored spotlights
+                    const cursorWorldX = _cursorX + containerScrollLeft();
+                    const cursorWorldY = _cursorY + containerScrollTop();
+                    spotlights.push({
+                        x: cursorWorldX,
+                        y: cursorWorldY,
+                        radius: radius,
+                        opacity: 0.8
+                    });
+                }
+            }
+            
+            // Draw all spotlights as a single overlay with multiple holes
+            // NOTE(review): drawAllSpotlights cuts its holes with 'destination-out' on this
+            // same canvas, which also removes arrow/pen pixels drawn above that fall inside
+            // a hole -- confirm this is the intended look.
+            drawAllSpotlights(_overlayCtx, spotlights, offX, offY);
+            
+            // Draw cursor indicators for non-spotlight tools
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                const color = getArrowColor();
+                const thickness = getLineThickness();
+                
+                if (tool !== 'spotlight') {
+                    _overlayCtx.save();
+                    _overlayCtx.fillStyle = color;
+                    _overlayCtx.globalAlpha = 0.7;
+                    
+                    if (tool === 'eraser') {
+                        // Draw eraser indicator
+                        _overlayCtx.strokeStyle = color;
+                        _overlayCtx.lineWidth = 2;
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, 10, 0, 2 * Math.PI);
+                        _overlayCtx.stroke();
+                    } else {
+                        // Draw dot for pen/arrow
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, thickness / 2, 0, 2 * Math.PI);
+                        _overlayCtx.fill();
+                    }
+                    
+                    _overlayCtx.restore();
+                }
+            }
+        }
+
+        function setOverlayActive(active) {
+            if (!_overlay) initOverlay();
+            _overlay.style.pointerEvents = active ? 'auto' : 'none';
+            _overlay.style.cursor = active ? 'none' : 'auto';
+            // Re-render to ensure visibility aligns with content
+            renderOverlay();
+        }
+
+        function initOverlay() {
+            if (_overlay) return;
+            updateOverlayModeAndContainer();
+            _overlay = document.createElement('canvas');
+            _overlay.className = 'draw-overlay';
+            _overlayCtx = _overlay.getContext('2d');
+            document.body.appendChild(_overlay);
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+
+            // Events
+            _overlay.addEventListener('mousedown', onPointerDown);
+            _overlay.addEventListener('mousemove', onPointerMove);
+            _overlay.addEventListener('mouseenter', onPointerEnter);
+            _overlay.addEventListener('mouseleave', onPointerLeave);
+            document.addEventListener('mouseup', onPointerUp);
+            _overlay.addEventListener('touchstart', onPointerDown, { passive: false });
+            _overlay.addEventListener('touchmove', onPointerMove, { passive: false });
+            document.addEventListener('touchend', onPointerUp);
+
+            _overlayResizeHandler = () => updateOverlayBounds();
+            window.addEventListener('resize', _overlayResizeHandler);
+
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+            
+            // Start fade animation timer
+            _fadeTimer = setInterval(updateShapesFade, 100); // Update every 100ms for smooth fade
+        }
+
+        function rebindOverlayContainer() {
+            if (!_overlay) return;
+            // Remove old scroll handler
+            if (_overlayScrollHandler) { window.removeEventListener('scroll', _overlayScrollHandler); }
+            updateOverlayModeAndContainer();
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+        }
+
+        function teardownOverlay() {
+            if (!_overlay) return;
+            _overlay.removeEventListener('mousedown', onPointerDown);
+            _overlay.removeEventListener('mousemove', onPointerMove);
+            _overlay.removeEventListener('mouseenter', onPointerEnter);
+            _overlay.removeEventListener('mouseleave', onPointerLeave);
+            document.removeEventListener('mouseup', onPointerUp);
+            _overlay.removeEventListener('touchstart', onPointerDown);
+            _overlay.removeEventListener('touchmove', onPointerMove);
+            document.removeEventListener('touchend', onPointerUp);
+            if (_overlayResizeHandler) window.removeEventListener('resize', _overlayResizeHandler);
+            if (_overlayScrollHandler) {
+                if (_overlayContainer === window) {
+                    window.removeEventListener('scroll', _overlayScrollHandler);
+                } else if (_overlayContainer) {
+                    _overlayContainer.removeEventListener('scroll', _overlayScrollHandler);
+                }
+            }
+            if (_fadeTimer) {
+                clearInterval(_fadeTimer);
+                _fadeTimer = null;
+            }
+            if (_overlay.parentNode) _overlay.parentNode.removeChild(_overlay);
+            _overlay = null; _overlayCtx = null; _overlayContainer = null; _overlayResizeHandler = null; _overlayScrollHandler = null; _drawing = null;
+        }
+        
+        function teardownFileExplorer() {
+            const fe = document.querySelector('.file-explorer');
+            if (fe && fe.parentNode) fe.parentNode.removeChild(fe);
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+
+        function runCell(cellId){
+            const btn=document.querySelector('.run-btn[onclick*="'+cellId+'"]');
+            const output=document.getElementById('output-'+cellId);
+            if(btn){btn.textContent='⏳ running...';btn.disabled=true;}
+            if(output){output.classList.add('output-stale');}
+            fetch('/run/'+cellId,{method:'POST'}).then(r=>r.json()).then(data=>{
+                if(output){
+                    output.classList.remove('output-stale');
+                    let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
+
+                    console.log('UV Logs:', data);
+                    if(data.stderr) {
+                        // Split UV logs from regular stderr
+                        const lines = data.stderr.split('\\n');
+                        let uvLogs = [];
+                        let regularLogs = [];
+                        let inUvSection = true;
+
+                        for (const line of lines) {
+                            if (inUvSection) {
+                                uvLogs.push(line);
+                                if (line.startsWith('Installed ')) {
+                                    inUvSection = false;
+                                }
+                            } else {
+                                regularLogs.push(line);
+                            }
+                        }
+                        
+
+                        // If we never found "Installed", treat it all as regular stderr
+                        if (inUvSection) {
+                            html+='<div class="cell-stderr">'+escapeHtml(data.stderr)+'</div>';
+                        } else {
+                            const uvLogsStr = uvLogs.join('\\n');
+                            const regularLogsStr = regularLogs.join('\\n').trim();
+
+                            if (uvLogsStr) {
+                                html+='<div class="uv-install-logs">';
+                                html+='<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>';
+                                html+='<div class="uv-logs-content" style="display: none;">'+escapeHtml(uvLogsStr)+'</div>';
+                                html+='</div>';
+                            }
+                            if (regularLogsStr) {
+                                html+='<div class="cell-stderr">'+escapeHtml(regularLogsStr)+'</div>';
+                            }
+                        }
+                    }
+                    output.innerHTML=html;
+                }
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            }).catch(e=>{
+                console.error('Run failed:',e);
+                if(output){output.classList.remove('output-stale');}
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            });
+        }
+
+        function copyCell(cellId){
+            // Try multiple selectors to find the code element
+            // Pygments generates .highlight > pre with spans, not wrapped in <code>
+            let codeElement = document.querySelector('#code-'+cellId+' .highlight pre');
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' pre');
+            }
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' code');
+            }
+            if (!codeElement) {
+                // Fallback to the code div itself
+                codeElement = document.getElementById('code-'+cellId);
+            }
+
+            const btn = document.querySelector('.copy-btn[onclick*="'+cellId+'"]');
+
+            if (!codeElement) {
+                console.error('Code element not found for cell:', cellId);
+                return;
+            }
+            if (!btn) {
+                console.error('Copy button not found for cell:', cellId);
+                return;
+            }
+
+            const codeText = codeElement.textContent;
+            
+            if (navigator.clipboard && navigator.clipboard.writeText) {
+                navigator.clipboard.writeText(codeText).then(function() {
+                    console.log('Clipboard copy successful');
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                }).catch(function(err) {
+                    console.warn('Clipboard copy failed:', err);
+                    fallbackCopy();
+                });
+            } else {
+                console.log('Using fallback copy method');
+                fallbackCopy();
+            }
+            
+            function fallbackCopy() {
+                const textarea = document.createElement('textarea');
+                textarea.value = codeText;
+                textarea.style.position = 'absolute';
+                textarea.style.left = '-9999px';
+                document.body.appendChild(textarea);
+                textarea.select();
+                try {
+                    const success = document.execCommand('copy');
+                    console.log('Fallback copy success:', success);
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                } catch (err) {
+                    console.error('Fallback copy failed:', err);
+                    btn.textContent = 'Copy failed';
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                    }, 2000);
+                }
+                document.body.removeChild(textarea);
+            }
+        }
+
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
+
+
+        // Page start-up, run once the DOM has been parsed: captures the initial cell
+        // states, refreshes theme/menu/debug UI, initialises the widgets (only when the
+        // <html> data-widgets attribute is 'on' or absent), applies state encoded in the
+        // URL, and installs the document-level mouse and Escape-key handlers.
+        document.addEventListener('DOMContentLoaded', function() {
+            // Capture initial cell states before any modifications
+            captureInitialCellStates();
+            
+            updateThemeIcon();
+            updateUiMenu();
+            updateUiDebug();
+            // A missing data-widgets attribute counts as 'on'
+            const widgetsEnabled = (document.documentElement.getAttribute('data-widgets') || 'on') === 'on';
+            if (widgetsEnabled) {
+            initMinimap();
+            initFileExplorer();
+            initTools();
+            initOverlay();
+            initStatusWidget();
+            initializeWidgetVisibility();
+            layoutWidgetsStackedBottomRight();
+            window.addEventListener('resize', layoutWidgetsStackedBottomRight);
+            }
+
+            // Apply deep-link selection if present
+            applyLocationFromUrl();
+            updateStateIndicator();
+            
+            // Apply cell states from URL immediately
+            const url = new URL(window.location.href);
+            const encodedCellStates = url.searchParams.get('cells');
+            if (encodedCellStates) {
+                console.log('Applying cell states from URL...');
+                const cellStates = decodeCellStatesFromUrl(encodedCellStates);
+                
+                // Use requestAnimationFrame to ensure DOM is ready
+                requestAnimationFrame(() => {
+                    applyCellStatesFromUrl(cellStates);
+                    
+                    // Clear initialization flag after cell states are applied
+                    // (the typeof guard tolerates the flag not being declared)
+                    if (typeof _isInitializing !== 'undefined') {
+                        _isInitializing = false;
+                    }
+                });
+            } else {
+                // Clear initialization flag even if no cell states
+                if (typeof _isInitializing !== 'undefined') {
+                    requestAnimationFrame(() => {
+                        _isInitializing = false;
+                    });
+                }
+            }
+
+            // Bind drag selection on line numbers
+            document.addEventListener('mousedown', onLineNumberMouseDown);
+            document.addEventListener('mousemove', onDocMouseMove);
+            document.addEventListener('mouseup', onDocMouseUp);
+
+            // Add ESC key handler to exit tools
+            document.addEventListener('keydown', function(e) {
+                if (e.key === 'Escape' || e.keyCode === 27) {
+                    const currentTool = document.body.dataset.tool;
+                    if (currentTool && currentTool !== 'none') {
+                        // Deactivate the current tool
+                        window.setActiveTool('none');
+                    }
+                    // Also clear any active line selection
+                    clearSelection(true);
+                }
+            });
+        });
+
+        function updateStateIndicator() {
+            try {
+                const el = document.getElementById('status-widget');
+                if (!el) return;
+                const tool = document.body.dataset.tool || 'none';
+                if (tool && tool !== 'none') {
+                    el.textContent = `tool: ${tool} — Esc`;
+                    return;
+                }
+                if (_selection) {
+                    const t = _selection.a === _selection.b ? `L${_selection.a}` : `L${_selection.a}-${_selection.b}`;
+                    el.textContent = `selected: ${t} — Esc`;
+                    return;
+                }
+                el.textContent = 'ready — Esc';
+            } catch (_) {}
+        }
+    </script>
 </head>
+
+
 <body>
-  <div class='controls'>
-    <a href='../index.html' class='back-button'>← back</a>
-  </div>
-  <h1>Index of /rotary</h1>
-  <ul>
-    <li><a href='impls/index.html' class='dir'>impls/</a></li>
-    <li><a href='results/index.html' class='dir'>results/</a></li>
-  </ul>
+    <div class="controls">
+        <div class="controls-buttons">
+            
+            <a href="index.html" class="back-button">← back</a>
+            
+            <div class="theme-toggle" onclick="toggleTheme()">light</div>
+            <div class="reset-toggle" onclick="resetLayout()">reset</div>
+            <div class="menu-button" onclick="toggleMenu()">
+                menu ▼
+                <div class="menu-dropdown">
+                    <div class="menu-item" onclick="setUiTheme('default')">
+                        <span class="menu-checkbox" id="checkbox-ui-default">☑</span> Theme: default
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('none')">
+                        <span class="menu-checkbox" id="checkbox-ui-none">☐</span> Theme: none
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('monocolor')">
+                        <span class="menu-checkbox" id="checkbox-ui-monocolor">☐</span> Theme: monocolor
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('tools')">
+                        <span class="menu-checkbox" id="checkbox-tools">☐</span> Tools
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('file-explorer')">
+                        <span class="menu-checkbox" id="checkbox-file-explorer">☐</span> File Explorer
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('minimap')">
+                        <span class="menu-checkbox" id="checkbox-minimap">☐</span> Table of Contents
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('status')">
+                        <span class="menu-checkbox" id="checkbox-status">☑</span> Status Indicator
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    
+    <div class="system-info">
+        <div class="system-info-header">Generated on:</div>
+        <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+        </div>
+    </div>
+    
+    <div class="main-content">
+        <h1>Rotary Position Embeddings Benchmarks</h1>
+<p>This directory contains benchmarks for Rotary Position Embeddings (RoPE) implementations.</p>
+<h2>Implementations</h2>
+<ul>
+<li><a href="impls/hf_kernels_rotary.html">HF Kernels Rotary</a> - HuggingFace kernels implementation</li>
+<li><a href="impls/torch_rotary.html">PyTorch Rotary</a> - PyTorch native implementation</li>
+</ul>
+<h2>Results</h2>
+<ul>
+<li><a href="results/combined_results.html">Combined Results</a> - Aggregated benchmark results with visualizations</li>
+</ul>
+    </div>
+    
 </body>
 </html>
\ No newline at end of file
diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg
index 793d43c4ad9f51efa85fd8e3504aaff6f6bbc3ad..3fdefb46544d73b9bc85fc2ae3e00add87b86535 100644
--- a/rotary/results/artifacts/combine/latency.svg
+++ b/rotary/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0517a426384d0bc9df1932ace04595ea1867cb036e7fbeced61eb044cff2e335
+oid sha256:36e71e631ab1a00097df3bc72a4532b4b383ed31a1df2368bd041e765254a9c3
 size 31018
diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html
index a1cdc49d1fc3934c88244cb81845c6ffb97c9784..17475d0e65452d0f310ef38d60c5c80c88e6833b 100644
--- a/rotary/results/combined_results.html
+++ b/rotary/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:08.848427</dc:date>
+    <dc:date>2025-10-29T14:27:54.393501</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 384.19215  L 823.142937 384.19215  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 385.895403  L 823.142937 385.895403  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_25">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="384.19215" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="385.895403" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_25">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="387.991369" transform="rotate(-0 40.72 387.991369)">0.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="389.694621" transform="rotate(-0 40.72 389.694621)">0.2</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 306.653539  L 823.142937 306.653539  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 308.195371  L 823.142937 308.195371  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_26">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="306.653539" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="308.195371" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_26">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="310.452758" transform="rotate(-0 40.72 310.452758)">0.3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="311.99459" transform="rotate(-0 40.72 311.99459)">0.3</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 229.114927  L 823.142937 229.114927  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 230.49534  L 823.142937 230.49534  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_27">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="229.114927" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="230.49534" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_27">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="232.914146" transform="rotate(-0 40.72 232.914146)">0.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="234.294559" transform="rotate(-0 40.72 234.294559)">0.4</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 151.576316  L 823.142937 151.576316  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 152.795309  L 823.142937 152.795309  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_28">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="151.576316" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="152.795309" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_28">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="155.375535" transform="rotate(-0 40.72 155.375535)">0.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="156.594528" transform="rotate(-0 40.72 156.594528)">0.5</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 74.037704  L 823.142937 74.037704  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 75.095278  L 823.142937 75.095278  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_29">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="74.037704" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="75.095278" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_29">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="77.836923" transform="rotate(-0 40.72 77.836923)">0.6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="78.894497" transform="rotate(-0 40.72 78.894497)">0.6</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4287,34 +4287,34 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 82.966497 405.060892  L 113.615625 361.374088  L 144.264753 368.740256  L 174.913881 368.958139  L 205.563009 371.664237  L 236.212137 372.082945  L 266.861265 372.113961  L 297.510393 370.059188  L 328.159521 369.570694  L 358.808648 370.368567  L 389.457776 370.997405  L 420.106904 364.940864  L 450.756032 370.21349  L 481.40516 369.546657  L 512.054288 369.84208  L 542.703416 370.439127  L 573.352544 368.570446  L 604.001672 369.011641  L 634.6508 368.655739  L 665.299928 368.896108  L 695.949056 369.864566  L 726.598184 370.345305  L 757.247312 360.808056  L 787.896439 44.888614  " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 82.966497 405.060892  L 113.615625 365.214762  L 144.264753 368.882204  L 174.913881 373.894633  L 205.563009 367.32121  L 236.212137 373.482823  L 266.861265 373.661533  L 297.510393 372.596265  L 328.159521 372.666195  L 358.808648 373.350733  L 389.457776 373.203103  L 420.106904 366.263713  L 450.756032 372.286242  L 481.40516 372.729132  L 512.054288 373.008852  L 542.703416 372.107532  L 573.352544 372.022062  L 604.001672 374.842573  L 634.6508 372.153375  L 665.299928 372.395022  L 695.949056 373.093546  L 726.598184 372.782745  L 757.247312 362.519348  L 787.896439 44.888614  " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p088c925177)">
      <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="113.615625" y="361.374088" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="144.264753" y="368.740256" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="174.913881" y="368.958139" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="205.563009" y="371.664237" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="236.212137" y="372.082945" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="266.861265" y="372.113961" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="297.510393" y="370.059188" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="328.159521" y="369.570694" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="358.808648" y="370.368567" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="389.457776" y="370.997405" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="420.106904" y="364.940864" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="450.756032" y="370.21349" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="481.40516" y="369.546657" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="512.054288" y="369.84208" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="542.703416" y="370.439127" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="573.352544" y="368.570446" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="604.001672" y="369.011641" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="634.6508" y="368.655739" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="665.299928" y="368.896108" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="695.949056" y="369.864566" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="726.598184" y="370.345305" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="757.247312" y="360.808056" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="113.615625" y="365.214762" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="144.264753" y="368.882204" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="174.913881" y="373.894633" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="205.563009" y="367.32121" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="236.212137" y="373.482823" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="266.861265" y="373.661533" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="297.510393" y="372.596265" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="328.159521" y="372.666195" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="358.808648" y="373.350733" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="389.457776" y="373.203103" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="420.106904" y="366.263713" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="450.756032" y="372.286242" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="481.40516" y="372.729132" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="512.054288" y="373.008852" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="542.703416" y="372.107532" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="573.352544" y="372.022062" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="604.001672" y="374.842573" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="634.6508" y="372.153375" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="665.299928" y="372.395022" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="695.949056" y="373.093546" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="726.598184" y="372.782745" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="757.247312" y="362.519348" style="fill: #1f77b4; stroke: #1f77b4" />
      <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
@@ -4364,7 +4364,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 4.36s
+Cell: combine | 4.35s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4453,7 +4453,7 @@ COMBINED BENCHMARK SUMMARY
 impl                     wl                  p50(ms)  ok
 hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.10  False
 hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  False
 hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  False
@@ -4478,8 +4478,8 @@ hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  False
 torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
 torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
 torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
 torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
 torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
@@ -4497,7 +4497,7 @@ torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
 torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
 torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
 
 GENERATING COMBINED VISUALIZATION
@@ -4518,7 +4518,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 219ms
+Installed 37 packages in 239ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4531,7 +4531,7 @@ Installed 37 packages in 219ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-28T14:09:08.848427</dc:date>
+    <dc:date>2025-10-29T14:27:54.393501</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4875,70 +4875,70 @@ Installed 37 packages in 219ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 384.19215  L 823.142937 384.19215  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 385.895403  L 823.142937 385.895403  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_25">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="384.19215" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="385.895403" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_25">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="387.991369" transform="rotate(-0 40.72 387.991369)">0.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="389.694621" transform="rotate(-0 40.72 389.694621)">0.2</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 306.653539  L 823.142937 306.653539  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 308.195371  L 823.142937 308.195371  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_26">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="306.653539" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="308.195371" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_26">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="310.452758" transform="rotate(-0 40.72 310.452758)">0.3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="311.99459" transform="rotate(-0 40.72 311.99459)">0.3</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 229.114927  L 823.142937 229.114927  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 230.49534  L 823.142937 230.49534  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_27">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="229.114927" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="230.49534" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_27">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="232.914146" transform="rotate(-0 40.72 232.914146)">0.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="234.294559" transform="rotate(-0 40.72 234.294559)">0.4</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 151.576316  L 823.142937 151.576316  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 152.795309  L 823.142937 152.795309  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_28">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="151.576316" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="152.795309" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_28">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="155.375535" transform="rotate(-0 40.72 155.375535)">0.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="156.594528" transform="rotate(-0 40.72 156.594528)">0.5</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 74.037704  L 823.142937 74.037704  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 75.095278  L 823.142937 75.095278  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_29">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="74.037704" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="75.095278" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_29">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="77.836923" transform="rotate(-0 40.72 77.836923)">0.6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="78.894497" transform="rotate(-0 40.72 78.894497)">0.6</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4946,34 +4946,34 @@ Installed 37 packages in 219ms
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 82.966497 405.060892  L 113.615625 361.374088  L 144.264753 368.740256  L 174.913881 368.958139  L 205.563009 371.664237  L 236.212137 372.082945  L 266.861265 372.113961  L 297.510393 370.059188  L 328.159521 369.570694  L 358.808648 370.368567  L 389.457776 370.997405  L 420.106904 364.940864  L 450.756032 370.21349  L 481.40516 369.546657  L 512.054288 369.84208  L 542.703416 370.439127  L 573.352544 368.570446  L 604.001672 369.011641  L 634.6508 368.655739  L 665.299928 368.896108  L 695.949056 369.864566  L 726.598184 370.345305  L 757.247312 360.808056  L 787.896439 44.888614  " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 82.966497 405.060892  L 113.615625 365.214762  L 144.264753 368.882204  L 174.913881 373.894633  L 205.563009 367.32121  L 236.212137 373.482823  L 266.861265 373.661533  L 297.510393 372.596265  L 328.159521 372.666195  L 358.808648 373.350733  L 389.457776 373.203103  L 420.106904 366.263713  L 450.756032 372.286242  L 481.40516 372.729132  L 512.054288 373.008852  L 542.703416 372.107532  L 573.352544 372.022062  L 604.001672 374.842573  L 634.6508 372.153375  L 665.299928 372.395022  L 695.949056 373.093546  L 726.598184 372.782745  L 757.247312 362.519348  L 787.896439 44.888614  " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p088c925177)">
      <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="113.615625" y="361.374088" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="144.264753" y="368.740256" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="174.913881" y="368.958139" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="205.563009" y="371.664237" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="236.212137" y="372.082945" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="266.861265" y="372.113961" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="297.510393" y="370.059188" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="328.159521" y="369.570694" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="358.808648" y="370.368567" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="389.457776" y="370.997405" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="420.106904" y="364.940864" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="450.756032" y="370.21349" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="481.40516" y="369.546657" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="512.054288" y="369.84208" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="542.703416" y="370.439127" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="573.352544" y="368.570446" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="604.001672" y="369.011641" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="634.6508" y="368.655739" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="665.299928" y="368.896108" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="695.949056" y="369.864566" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="726.598184" y="370.345305" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="757.247312" y="360.808056" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="113.615625" y="365.214762" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="144.264753" y="368.882204" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="174.913881" y="373.894633" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="205.563009" y="367.32121" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="236.212137" y="373.482823" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="266.861265" y="373.661533" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="297.510393" y="372.596265" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="328.159521" y="372.666195" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="358.808648" y="373.350733" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="389.457776" y="373.203103" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="420.106904" y="366.263713" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="450.756032" y="372.286242" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="481.40516" y="372.729132" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="512.054288" y="373.008852" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="542.703416" y="372.107532" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="573.352544" y="372.022062" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="604.001672" y="374.842573" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="634.6508" y="372.153375" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="665.299928" y="372.395022" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="695.949056" y="373.093546" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="726.598184" y="372.782745" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="757.247312" y="362.519348" style="fill: #1f77b4; stroke: #1f77b4" />
      <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>