diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl
index 7d1a3c651f4f5824ec6a49a44a98836bdbadbb74..e014dc32b9e6c116e3ee3407f5e64ef8451eee67 100644
--- a/activation/impls/artifacts/benchmark/activation.jsonl
+++ b/activation/impls/artifacts/benchmark/activation.jsonl
@@ -1,9 +1,9 @@
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.022969999974975508, "p50": 0.023499999997511622, "p90": 0.023961000010785938, "mean": 0.02361460000201987, "iqr": 0.0009899999895424116, "raw_times": [0.022971000021243526, 0.022969999974975508, 0.023961000010785938, 0.023499999997511622, 0.024671000005582755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03073999999969601, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027540000019143918, "p50": 0.029130999962490023, "p90": 0.03002100004323438, "mean": 0.029014800009008468, "iqr": 0.0016900000332498166, "raw_times": [0.027540000019143918, 0.030051000010189455, 0.03002100004323438, 0.029130999962490023, 0.028331000009984564], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343999998151048, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02829999999676147, "p50": 0.029119999965132592, "p90": 0.03051000004461457, "mean": 0.029939999990347133, "iqr": 0.0019500000689731678, "raw_times": [0.02829999999676147, 0.03051000004461457, 0.033209999969585624, 0.029119999965132592, 0.028559999975641404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031761000002461515, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027531000000635686, "p50": 0.028170999996746104, "p90": 0.028501000031155854, "mean": 0.028293000002577173, "iqr": 0.0008900000239009387, "raw_times": [0.027611000007254916, 0.028170999996746104, 0.029650999977093306, 0.027531000000635686, 0.028501000031155854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03130000004603062, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02755000002707675, "p50": 0.02861000001530556, "p90": 0.028831000008722185, "mean": 0.02867660001584227, "iqr": 0.00023000001192485797, "raw_times": [0.028600999996797327, 0.029791000031309522, 0.028831000008722185, 0.02755000002707675, 0.02861000001530556], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03139000000373926, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02618100000972845, "p50": 0.027131000024382956, "p90": 0.02731099999664366, "mean": 0.026918799994746223, "iqr": 0.0007610000238855719, "raw_times": [0.02618100000972845, 0.027131000024382956, 0.027420999970217963, 0.02731099999664366, 0.026549999972758087], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03008099997714453, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026950999995278835, "p50": 0.02748000002839035, "p90": 0.02804100000730614, "mean": 0.02758480000011332, "iqr": 0.0006300000450210064, "raw_times": [0.026950999995278835, 0.02804100000730614, 0.027410999962285132, 0.02804100000730614, 0.02748000002839035], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03104999996139668, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026300000001810986, "p50": 0.02733100001250932, "p90": 0.0275399999623005, "mean": 0.02720039998393986, "iqr": 0.0004789999934473599, "raw_times": [0.02706099996885314, 0.02733100001250932, 0.027769999974225357, 0.0275399999623005, 0.026300000001810986], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03032000000757762, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
-{"ts": "2025-10-30T15:52:35Z", "run": "37230f1ce31641f2b6ebd7aba7f793c9", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02642100002958614, "p50": 0.027860999978202017, "p90": 0.02790100000993334, "mean": 0.027615000010428048, "iqr": 0.00036000000136482413, "raw_times": [0.02642100002958614, 0.028351000025850226, 0.027541000008568517, 0.02790100000993334, 0.027860999978202017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03163999997468636, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024230000008174102, "p50": 0.024741000004269154, "p90": 0.025410999967334646, "mean": 0.024872599999525846, "iqr": 0.0011599999538702832, "raw_times": [0.024251000013464363, 0.025730000004386966, 0.024230000008174102, 0.025410999967334646, 0.024741000004269154], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03134100001034312, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026611000009779673, "p50": 0.029731000040555955, "p90": 0.03027100001418148, "mean": 0.029349000021738902, "iqr": 0.0009999999974752427, "raw_times": [0.026611000009779673, 0.029731000040555955, 0.030861000027471164, 0.03027100001418148, 0.02927100001670624], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034871000025304966, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027259999967554904, "p50": 0.02879100003383428, "p90": 0.030951000042023225, "mean": 0.029224800016436348, "iqr": 0.0029600000175378227, "raw_times": [0.027991000024485402, 0.031131000014283927, 0.02879100003383428, 0.030951000042023225, 0.027259999967554904], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.0323909999906391, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025391000008312403, "p50": 0.02888100004838634, "p90": 0.029160999986288516, "mean": 0.028055000007043418, "iqr": 0.001839999981712026, "raw_times": [0.025391000008312403, 0.02888100004838634, 0.02952099998765334, 0.029160999986288516, 0.02732100000457649], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031509999985246395, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026630000036220736, "p50": 0.027450000004591857, "p90": 0.027921000025799003, "mean": 0.02735460001304091, "iqr": 0.0010800000040944724, "raw_times": [0.026630000036220736, 0.027450000004591857, 0.02684100002170453, 0.027921000025799003, 0.027930999976888415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03172099997073019, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025049999976545223, "p50": 0.02733100001250932, "p90": 0.028329999963716546, "mean": 0.02741439998317219, "iqr": 0.0016189999882953998, "raw_times": [0.025049999976545223, 0.029649999987668707, 0.028329999963716546, 0.02733100001250932, 0.026710999975421146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028341000017917395, "p50": 0.02927099995986282, "p90": 0.029501000028631097, "mean": 0.02909080000108588, "iqr": 0.0009110000291912002, "raw_times": [0.028341000017917395, 0.02927099995986282, 0.029501000028631097, 0.029750999999578198, 0.028589999999439897], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03009099998507736, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024770999971224228, "p50": 0.02814099997294761, "p90": 0.028720999978304462, "mean": 0.0278467999919485, "iqr": 0.0007409999511764909, "raw_times": [0.024770999971224228, 0.02798000002712797, 0.028720999978304462, 0.02814099997294761, 0.029621000010138232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031990999957542954, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027751000004627713, "p50": 0.028230999987499672, "p90": 0.029471000004832604, "mean": 0.028608800005258672, "iqr": 0.0016500000015184924, "raw_times": [0.028230999987499672, 0.027751000004627713, 0.02782100000331411, 0.02977000002601926, 0.029471000004832604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030850999962694914, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html
index cb9b44ffb1c09312b21e7b7e432e9d78fbf6e49d..812f027418b96fc5dd3cda564134f577079c3349 100644
--- a/activation/impls/hf_kernels_swiglu.html
+++ b/activation/impls/hf_kernels_swiglu.html
@@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.23s
+Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/activation" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">
@@ -4122,7 +4123,7 @@ Cell: nv | 0.23s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:16 2025       
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:17 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   29C    P0             86W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   33C    P0            108W /  350W |       0MiB /  46068MiB |     88%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,11 +4156,12 @@ Cell: nv | 0.23s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 4.17s
+Cell: benchmark | 4.19s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/activation" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="34">
 <div class="code-wrap">
@@ -4211,17 +4213,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      78.752us      1953.17%      78.752us      78.752us             1  
-                                      hf_kernels_swiglu         9.29%     160.875us        99.59%       1.725ms       1.725ms       0.000us         0.00%       5.440us       5.440us             1  
-                      _activation_beeaae6::silu_and_mul         1.15%      19.839us        87.61%       1.518ms     505.995us       4.032us       100.00%       5.440us       1.813us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
-                                Activity Buffer Request        83.97%       1.455ms        83.97%       1.455ms       1.455ms       1.408us        34.92%       1.408us       1.408us             1  
-                                            aten::empty         2.69%      46.600us         2.69%      46.600us      15.533us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.49%      43.201us         2.49%      43.201us      14.400us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.41%       7.161us         0.41%       7.161us       7.161us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us     105.055us      2585.65%     105.055us     105.055us             1  
+                                      hf_kernels_swiglu        11.41%     202.714us        99.64%       1.770ms       1.770ms       0.000us         0.00%       5.471us       5.471us             1  
+                      _activation_beeaae6::silu_and_mul         1.18%      21.050us        84.47%       1.501ms     500.190us       4.063us       100.00%       5.471us       1.824us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
+                                Activity Buffer Request        80.70%       1.434ms        80.70%       1.434ms       1.434ms       1.408us        34.65%       1.408us       1.408us             1  
+                                            aten::empty         3.76%      66.772us         3.76%      66.772us      22.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.58%      45.872us         2.58%      45.872us      15.291us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.36%       6.420us         0.36%       6.420us       6.420us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.733ms
-Self CUDA time total: 4.032us
+Self CPU time total: 1.776ms
+Self CUDA time total: 4.063us
 
 
 
@@ -4231,17 +4233,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      62.528us      1575.81%      62.528us      62.528us             1  
-                                      hf_kernels_swiglu         6.86%     110.833us        99.69%       1.610ms       1.610ms       0.000us         0.00%       5.312us       5.312us             1  
-                      _activation_beeaae6::silu_and_mul         1.31%      21.159us        91.69%       1.481ms     493.565us       3.968us       100.00%       5.312us       1.771us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
-                                Activity Buffer Request        88.77%       1.434ms        88.77%       1.434ms       1.434ms       1.344us        33.87%       1.344us       1.344us             1  
-                                            aten::empty         1.14%      18.330us         1.14%      18.330us       6.110us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.61%      26.001us         1.61%      26.001us       8.667us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.030us         0.31%       5.030us       5.030us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      61.119us      1540.69%      61.119us      61.119us             1  
+                                      hf_kernels_swiglu         6.50%     104.811us        99.67%       1.607ms       1.607ms       0.000us         0.00%       5.279us       5.279us             1  
+                      _activation_beeaae6::silu_and_mul         1.26%      20.331us        91.95%       1.482ms     494.073us       3.967us       100.00%       5.279us       1.760us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.967us       100.00%       3.967us       1.322us             3  
+                                Activity Buffer Request        89.13%       1.437ms        89.13%       1.437ms       1.437ms       1.312us        33.07%       1.312us       1.312us             1  
+                                            aten::empty         1.22%      19.632us         1.22%      19.632us       6.544us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.56%      25.120us         1.56%      25.120us       8.373us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.360us         0.33%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.615ms
-Self CUDA time total: 3.968us
+Self CPU time total: 1.612ms
+Self CUDA time total: 3.967us
 
 
 
@@ -4251,17 +4253,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.232us      1291.50%      63.232us      63.232us             1  
-                                      hf_kernels_swiglu         6.20%     101.121us        99.70%       1.627ms       1.627ms       0.000us         0.00%       6.528us       6.528us             1  
-                      _activation_beeaae6::silu_and_mul         1.27%      20.780us        92.37%       1.507ms     502.489us       4.896us       100.00%       6.528us       2.176us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.896us       100.00%       4.896us       1.632us             3  
-                                Activity Buffer Request        89.54%       1.461ms        89.54%       1.461ms       1.461ms       1.632us        33.33%       1.632us       1.632us             1  
-                                            aten::empty         1.13%      18.440us         1.13%      18.440us       6.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.56%      25.391us         1.56%      25.391us       8.464us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       4.970us         0.30%       4.970us       4.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.488us      1288.31%      63.488us      63.488us             1  
+                                      hf_kernels_swiglu         6.89%     111.363us        99.67%       1.611ms       1.611ms       0.000us         0.00%       6.592us       6.592us             1  
+                      _activation_beeaae6::silu_and_mul         1.36%      22.028us        91.47%       1.479ms     492.912us       4.928us       100.00%       6.592us       2.197us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.928us       100.00%       4.928us       1.643us             3  
+                                Activity Buffer Request        88.52%       1.431ms        88.52%       1.431ms       1.431ms       1.664us        33.77%       1.664us       1.664us             1  
+                                            aten::empty         1.30%      21.081us         1.30%      21.081us       7.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.59%      25.652us         1.59%      25.652us       8.551us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.390us         0.33%       5.390us       5.390us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.632ms
-Self CUDA time total: 4.896us
+Self CPU time total: 1.617ms
+Self CUDA time total: 4.928us
 
 
 
@@ -4271,17 +4273,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.664us      1554.55%      65.664us      65.664us             1  
-                                      hf_kernels_swiglu         5.63%     101.442us        99.74%       1.798ms       1.798ms       0.000us         0.00%       5.632us       5.632us             1  
-                      _activation_beeaae6::silu_and_mul         1.18%      21.341us        92.99%       1.677ms     558.850us       4.224us       100.00%       5.632us       1.877us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.224us       100.00%       4.224us       1.408us             3  
-                                Activity Buffer Request        79.26%       1.429ms        79.26%       1.429ms       1.429ms       1.408us        33.33%       1.408us       1.408us             1  
-                                            aten::empty         1.12%      20.239us         1.12%      20.239us       6.746us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        12.54%     226.164us        12.54%     226.164us      75.388us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.649us         0.26%       4.649us       4.649us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      68.000us      1585.82%      68.000us      68.000us             1  
+                                      hf_kernels_swiglu         5.97%     106.915us        99.70%       1.784ms       1.784ms       0.000us         0.00%       5.760us       5.760us             1  
+                      _activation_beeaae6::silu_and_mul         1.16%      20.770us        92.62%       1.658ms     552.564us       4.288us       100.00%       5.760us       1.920us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.288us       100.00%       4.288us       1.429us             3  
+                                Activity Buffer Request        80.58%       1.442ms        80.58%       1.442ms       1.442ms       1.472us        34.33%       1.472us       1.472us             1  
+                                            aten::empty         1.10%      19.770us         1.10%      19.770us       6.590us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.88%     194.785us        10.88%     194.785us      64.928us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.350us         0.30%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.803ms
-Self CUDA time total: 4.224us
+Self CPU time total: 1.790ms
+Self CUDA time total: 4.288us
 
 
 
@@ -4291,17 +4293,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.968us      1086.23%      63.968us      63.968us             1  
-                                      hf_kernels_swiglu        19.44%      85.062us        98.79%     432.257us     432.257us       0.000us         0.00%       7.874us       7.874us             1  
-                      _activation_beeaae6::silu_and_mul         4.74%      20.731us        74.99%     328.126us     109.375us       5.889us       100.00%       7.874us       2.625us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.889us       100.00%       5.889us       1.963us             3  
-                                Activity Buffer Request        29.32%     128.302us        29.32%     128.302us     128.302us       1.985us        33.71%       1.985us       1.985us             1  
-                                            aten::empty         4.36%      19.069us         4.36%      19.069us       6.356us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        40.93%     179.093us        40.93%     179.093us      59.698us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.21%       5.289us         1.21%       5.289us       5.289us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.599us      1108.28%      65.599us      65.599us             1  
+                                      hf_kernels_swiglu        18.75%      89.073us        98.88%     469.813us     469.813us       0.000us         0.00%       7.903us       7.903us             1  
+                      _activation_beeaae6::silu_and_mul         4.69%      22.280us        76.20%     362.069us     120.690us       5.919us       100.00%       7.903us       2.634us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us       100.00%       5.919us       1.973us             3  
+                                Activity Buffer Request        38.23%     181.645us        38.23%     181.645us     181.645us       1.984us        33.52%       1.984us       1.984us             1  
+                                            aten::empty         3.93%      18.671us         3.93%      18.671us       6.224us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.28%     158.144us        33.28%     158.144us      52.715us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.12%       5.330us         1.12%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 437.546us
-Self CUDA time total: 5.889us
+Self CPU time total: 475.143us
+Self CUDA time total: 5.919us
 
 
 
@@ -4311,17 +4313,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.167us       867.45%      67.167us      67.167us             1  
-                                      hf_kernels_swiglu         5.97%     103.951us        99.66%       1.736ms       1.736ms       0.000us         0.00%      10.335us      10.335us             1  
-                      _activation_beeaae6::silu_and_mul         1.17%      20.451us        92.57%       1.612ms     537.363us       7.743us       100.00%      10.335us       3.445us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us       100.00%       7.743us       2.581us             3  
-                                Activity Buffer Request        82.03%       1.429ms        82.03%       1.429ms       1.429ms       2.592us        33.48%       2.592us       2.592us             1  
-                                            aten::empty         1.12%      19.510us         1.12%      19.510us       6.503us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.36%     162.983us         9.36%     162.983us      54.328us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.34%       5.970us         0.34%       5.970us       5.970us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.207us       906.60%      70.207us      70.207us             1  
+                                      hf_kernels_swiglu         6.12%     106.261us        99.74%       1.733ms       1.733ms       0.000us         0.00%      10.336us      10.336us             1  
+                      _activation_beeaae6::silu_and_mul         1.25%      21.782us        92.41%       1.606ms     535.254us       7.744us       100.00%      10.336us       3.445us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us       100.00%       7.744us       2.581us             3  
+                                Activity Buffer Request        82.36%       1.431ms        82.36%       1.431ms       1.431ms       2.592us        33.47%       2.592us       2.592us             1  
+                                            aten::empty         1.21%      21.081us         1.21%      21.081us       7.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.80%     152.893us         8.80%     152.893us      50.964us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.511us         0.26%       4.511us       4.511us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.742ms
-Self CUDA time total: 7.743us
+Self CPU time total: 1.738ms
+Self CUDA time total: 7.744us
 
 
 
@@ -4331,17 +4333,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.999us      1036.41%      67.999us      67.999us             1  
-                                      hf_kernels_swiglu         5.88%     101.172us        99.74%       1.716ms       1.716ms       0.000us         0.00%       8.769us       8.769us             1  
-                      _activation_beeaae6::silu_and_mul         1.20%      20.670us        92.73%       1.596ms     531.873us       6.561us       100.00%       8.769us       2.923us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.561us       100.00%       6.561us       2.187us             3  
-                                Activity Buffer Request        82.56%       1.421ms        82.56%       1.421ms       1.421ms       2.208us        33.65%       2.208us       2.208us             1  
-                                            aten::empty         1.13%      19.490us         1.13%      19.490us       6.497us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.96%     154.233us         8.96%     154.233us      51.411us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.26%       4.490us         0.26%       4.490us       4.490us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.214us      1045.06%      69.214us      69.214us             1  
+                                      hf_kernels_swiglu         7.00%     122.783us        99.73%       1.750ms       1.750ms       0.000us         0.00%       8.830us       8.830us             1  
+                      _activation_beeaae6::silu_and_mul         1.22%      21.430us        91.58%       1.607ms     535.694us       6.623us       100.00%       8.830us       2.943us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.623us       100.00%       6.623us       2.208us             3  
+                                Activity Buffer Request        81.74%       1.434ms        81.74%       1.434ms       1.434ms       2.207us        33.32%       2.207us       2.207us             1  
+                                            aten::empty         1.15%      20.211us         1.15%      20.211us       6.737us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.62%     151.304us         8.62%     151.304us      50.435us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.780us         0.27%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.721ms
-Self CUDA time total: 6.561us
+Self CPU time total: 1.755ms
+Self CUDA time total: 6.623us
 
 
 
@@ -4351,17 +4353,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.295us       670.43%      63.295us      63.295us             1  
-                                      hf_kernels_swiglu        23.24%      86.211us        98.67%     366.026us     366.026us       0.000us         0.00%      12.609us      12.609us             1  
-                      _activation_beeaae6::silu_and_mul         5.71%      21.191us        70.40%     261.155us      87.052us       9.441us       100.00%      12.609us       4.203us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.441us       100.00%       9.441us       3.147us             3  
-                                Activity Buffer Request        23.85%      88.481us        23.85%      88.481us      88.481us       3.168us        33.56%       3.168us       3.168us             1  
-                                            aten::empty         5.03%      18.660us         5.03%      18.660us       6.220us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        40.84%     151.483us        40.84%     151.483us      50.494us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.33%       4.920us         1.33%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.152us       692.52%      65.152us      65.152us             1  
+                                      hf_kernels_swiglu        21.62%      91.474us        98.93%     418.571us     418.571us       0.000us         0.00%      12.576us      12.576us             1  
+                      _activation_beeaae6::silu_and_mul         4.88%      20.631us        69.03%     292.067us      97.356us       9.408us       100.00%      12.576us       4.192us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.408us       100.00%       9.408us       3.136us             3  
+                                Activity Buffer Request        28.63%     121.143us        28.63%     121.143us     121.143us       3.168us        33.67%       3.168us       3.168us             1  
+                                            aten::empty         8.28%      35.030us         8.28%      35.030us      11.677us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.52%     150.293us        35.52%     150.293us      50.098us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.07%       4.530us         1.07%       4.530us       4.530us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 370.946us
-Self CUDA time total: 9.441us
+Self CPU time total: 423.101us
+Self CUDA time total: 9.408us
 
 
 
@@ -4371,17 +4373,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      65.342us       500.47%      65.342us      65.342us             1  
-                                      hf_kernels_swiglu        22.94%      96.471us        98.88%     415.727us     415.727us       0.000us         0.00%      17.408us      17.408us             1  
-                      _activation_beeaae6::silu_and_mul         5.11%      21.490us        71.29%     299.725us      99.908us      13.056us       100.00%      17.408us       5.803us             3  
-void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.056us       100.00%      13.056us       4.352us             3  
-                                Activity Buffer Request        30.59%     128.632us        30.59%     128.632us     128.632us       4.352us        33.33%       4.352us       4.352us             1  
-                                            aten::empty         4.65%      19.531us         4.65%      19.531us       6.510us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.58%     149.603us        35.58%     149.603us      49.868us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.12%       4.720us         1.12%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.197us       514.72%      67.197us      67.197us             1  
+                                      hf_kernels_swiglu        22.39%      97.642us        98.93%     431.481us     431.481us       0.000us         0.00%      17.439us      17.439us             1  
+                      _activation_beeaae6::silu_and_mul         4.99%      21.781us        71.94%     313.789us     104.596us      13.055us       100.00%      17.439us       5.813us             3  
+void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.055us       100.00%      13.055us       4.352us             3  
+                                Activity Buffer Request        32.48%     141.684us        32.48%     141.684us     141.684us       4.384us        33.58%       4.384us       4.384us             1  
+                                            aten::empty         4.60%      20.050us         4.60%      20.050us       6.683us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.47%     150.324us        34.47%     150.324us      50.108us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.07%       4.681us         1.07%       4.681us       4.681us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 420.447us
-Self CUDA time total: 13.056us
+Self CPU time total: 436.162us
+Self CUDA time total: 13.055us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4398,12 +4400,12 @@ hf_kernels_swiglu        cuda_T512_D768         0.03  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 15 packages in 13ms
+Installed 15 packages in 15ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 7 files:   0%|          | 0/7 [00:00&lt;?, ?it/s]
-Fetching 7 files:  71%|███████▏  | 5/7 [00:00&lt;00:00, 14.50it/s]
-Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 20.28it/s]</div>
+Fetching 7 files:  71%|███████▏  | 5/7 [00:00&lt;00:00, 15.31it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 21.41it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html
index 215b2799716ac41798e6372ba0e150a2bd6bd9c0..41f6e46a2626019e3e97d61016b7b71b844385d6 100644
--- a/activation/impls/torch_swiglu.html
+++ b/activation/impls/torch_swiglu.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.23s
+Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">
@@ -4122,7 +4122,7 @@ Cell: nv | 0.23s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:16 2025       
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:17 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   29C    P0             86W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   33C    P0            108W /  350W |       0MiB /  46068MiB |     88%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,11 +4155,11 @@ Cell: nv | 0.23s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 6.88s
+Cell: benchmark | 7.02s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="28">
 <div class="code-wrap">
@@ -4205,20 +4205,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     179.327us      1411.47%     179.327us     179.327us             1  
-                                            torch_eager        11.22%     210.364us        99.57%       1.867ms       1.867ms       0.000us         0.00%      15.009us      15.009us             1  
-                                             aten::silu         3.37%      63.151us        82.30%       1.543ms     514.355us       6.497us        51.14%       8.801us       2.934us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.497us        51.14%       6.497us       2.166us             3  
-                                              aten::mul         1.76%      33.030us         2.90%      54.310us      18.103us       6.208us        48.86%       6.208us       2.069us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        48.86%       6.208us       2.069us             3  
-                                Activity Buffer Request        76.72%       1.439ms        76.72%       1.439ms       1.439ms       2.304us        18.13%       2.304us       2.304us             1  
-                                            aten::slice         2.52%      47.241us         3.15%      59.052us       9.842us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.63%      11.811us         0.63%      11.811us       1.968us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.34%      62.690us         3.34%      62.690us      10.448us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.43%       8.120us         0.43%       8.120us       8.120us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     188.575us      1476.70%     188.575us     188.575us             1  
+                                            torch_eager        11.13%     210.826us        99.56%       1.887ms       1.887ms       0.000us         0.00%      15.106us      15.106us             1  
+                                             aten::silu         3.37%      63.781us        82.44%       1.562ms     520.736us       6.497us        50.88%       8.833us       2.944us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.497us        50.88%       6.497us       2.166us             3  
+                                              aten::mul         1.86%      35.170us         2.95%      55.841us      18.614us       6.273us        49.12%       6.273us       2.091us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.273us        49.12%       6.273us       2.091us             3  
+                                Activity Buffer Request        76.78%       1.455ms        76.78%       1.455ms       1.455ms       2.336us        18.29%       2.336us       2.336us             1  
+                                            aten::slice         2.45%      46.380us         3.05%      57.842us       9.640us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.60%      11.462us         0.60%      11.462us       1.910us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.38%      64.112us         3.38%      64.112us      10.685us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.44%       8.280us         0.44%       8.280us       8.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.875ms
-Self CUDA time total: 12.705us
+Self CPU time total: 1.895ms
+Self CUDA time total: 12.770us
 
 
 
@@ -4228,20 +4228,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.777us      1228.76%     151.777us     151.777us             1  
-                                            torch_eager         6.62%     113.831us        99.66%       1.713ms       1.713ms       0.000us         0.00%      14.496us      14.496us             1  
-                                             aten::silu         2.46%      42.260us        88.64%       1.523ms     507.722us       6.368us        51.55%       8.512us       2.837us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        51.55%       6.368us       2.123us             3  
-                                              aten::mul         1.53%      26.241us         2.60%      44.713us      14.904us       5.984us        48.45%       5.984us       1.995us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        48.45%       5.984us       1.995us             3  
-                                Activity Buffer Request        84.63%       1.454ms        84.63%       1.454ms       1.454ms       2.144us        17.36%       2.144us       2.144us             1  
-                                            aten::slice         1.45%      24.880us         1.80%      30.920us       5.153us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.35%       6.040us         0.35%       6.040us       1.007us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.62%      45.062us         2.62%      45.062us       7.510us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.34%       5.800us         0.34%       5.800us       5.800us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     152.926us      1234.87%     152.926us     152.926us             1  
+                                            torch_eager         6.55%     113.093us        99.67%       1.721ms       1.721ms       0.000us         0.00%      14.560us      14.560us             1  
+                                             aten::silu         2.40%      41.391us        88.69%       1.532ms     510.609us       6.400us        51.68%       8.576us       2.859us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.400us        51.68%       6.400us       2.133us             3  
+                                              aten::mul         1.50%      25.830us         2.63%      45.361us      15.120us       5.984us        48.32%       5.984us       1.995us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.984us        48.32%       5.984us       1.995us             3  
+                                Activity Buffer Request        84.72%       1.463ms        84.72%       1.463ms       1.463ms       2.176us        17.57%       2.176us       2.176us             1  
+                                            aten::slice         1.43%      24.741us         1.80%      31.062us       5.177us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.37%       6.321us         0.37%       6.321us       1.054us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.71%      46.721us         2.71%      46.721us       7.787us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.33%       5.741us         0.33%       5.741us       5.741us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.718ms
-Self CUDA time total: 12.352us
+Self CPU time total: 1.727ms
+Self CUDA time total: 12.384us
 
 
 
@@ -4251,20 +4251,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.422us      1145.66%     151.422us     151.422us             1  
-                                            torch_eager         6.39%     108.591us        99.69%       1.694ms       1.694ms       0.000us         0.00%      15.489us      15.489us             1  
-                                             aten::silu         2.42%      41.180us        88.84%       1.509ms     503.045us       6.784us        51.33%       9.056us       3.019us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.33%       6.784us       2.261us             3  
-                                              aten::mul         1.56%      26.573us         2.72%      46.263us      15.421us       6.433us        48.67%       6.433us       2.144us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.433us        48.67%       6.433us       2.144us             3  
-                                Activity Buffer Request        84.90%       1.442ms        84.90%       1.442ms       1.442ms       2.272us        17.19%       2.272us       2.272us             1  
-                                            aten::slice         1.42%      24.110us         1.74%      29.570us       4.928us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.32%       5.460us         0.32%       5.460us       0.910us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         2.67%      45.420us         2.67%      45.420us       7.570us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.31%       5.240us         0.31%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     152.413us      1147.86%     152.413us     152.413us             1  
+                                            torch_eager         6.17%     105.134us        99.68%       1.699ms       1.699ms       0.000us         0.00%      15.581us      15.581us             1  
+                                             aten::silu         2.58%      43.990us        88.96%       1.517ms     505.533us       6.814us        51.32%       9.117us       3.039us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.814us        51.32%       6.814us       2.271us             3  
+                                              aten::mul         1.63%      27.711us         2.72%      46.371us      15.457us       6.464us        48.68%       6.464us       2.155us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.464us        48.68%       6.464us       2.155us             3  
+                                Activity Buffer Request        84.84%       1.446ms        84.84%       1.446ms       1.446ms       2.303us        17.34%       2.303us       2.303us             1  
+                                            aten::slice         1.47%      24.990us         1.83%      31.250us       5.208us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.37%       6.260us         0.37%       6.260us       1.043us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.63%      44.871us         2.63%      44.871us       7.478us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       5.431us         0.32%       5.431us       5.431us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.699ms
-Self CUDA time total: 13.217us
+Self CPU time total: 1.705ms
+Self CUDA time total: 13.278us
 
 
 
@@ -4274,20 +4274,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     152.159us      1197.73%     152.159us     152.159us             1  
-                                            torch_eager         7.49%     109.251us        99.65%       1.454ms       1.454ms       0.000us         0.00%      14.912us      14.912us             1  
-                                             aten::silu         2.87%      41.871us        86.91%       1.268ms     422.724us       6.560us        51.64%       8.768us       2.923us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.64%       6.560us       2.187us             3  
-                                              aten::mul         1.82%      26.542us         3.09%      45.132us      15.044us       6.144us        48.36%       6.144us       2.048us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.144us        48.36%       6.144us       2.048us             3  
-                                Activity Buffer Request        71.19%       1.039ms        71.19%       1.039ms       1.039ms       2.208us        17.38%       2.208us       2.208us             1  
-                                            aten::slice         1.75%      25.480us         2.16%      31.560us       5.260us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.42%       6.080us         0.42%       6.080us       1.013us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        14.12%     206.043us        14.12%     206.043us      34.340us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.35%       5.050us         0.35%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.359us      1219.84%     155.359us     155.359us             1  
+                                            torch_eager         6.31%     109.593us        99.71%       1.733ms       1.733ms       0.000us         0.00%      14.944us      14.944us             1  
+                                             aten::silu         2.48%      43.021us        88.93%       1.545ms     515.160us       6.560us        51.51%       8.768us       2.923us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.560us        51.51%       6.560us       2.187us             3  
+                                              aten::mul         1.62%      28.091us         2.66%      46.261us      15.420us       6.176us        48.49%       6.176us       2.059us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        48.49%       6.176us       2.059us             3  
+                                Activity Buffer Request        74.70%       1.298ms        74.70%       1.298ms       1.298ms       2.208us        17.34%       2.208us       2.208us             1  
+                                            aten::slice         1.46%      25.370us         1.82%      31.631us       5.272us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.261us         0.36%       6.261us       1.043us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.80%     222.405us        12.80%     222.405us      37.068us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.29%       4.960us         0.29%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.459ms
-Self CUDA time total: 12.704us
+Self CPU time total: 1.738ms
+Self CUDA time total: 12.736us
 
 
 
@@ -4297,20 +4297,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     147.295us      1117.14%     147.295us     147.295us             1  
-                                            torch_eager         5.91%     105.630us        99.72%       1.782ms       1.782ms       0.000us         0.00%      15.457us      15.457us             1  
-                                             aten::silu         2.35%      41.900us        89.64%       1.602ms     533.846us       6.752us        51.21%       9.024us       3.008us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us        51.21%       6.752us       2.251us             3  
-                                              aten::mul         1.43%      25.502us         2.46%      43.882us      14.627us       6.433us        48.79%       6.433us       2.144us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.433us        48.79%       6.433us       2.144us             3  
-                                Activity Buffer Request        78.53%       1.403ms        78.53%       1.403ms       1.403ms       2.272us        17.23%       2.272us       2.272us             1  
-                                            aten::slice         1.39%      24.781us         1.71%      30.582us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.32%       5.801us         0.32%       5.801us       0.967us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.80%     175.053us         9.80%     175.053us      29.176us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.28%       4.969us         0.28%       4.969us       4.969us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     153.122us      1152.94%     153.122us     153.122us             1  
+                                            torch_eager         5.95%     108.905us        99.72%       1.827ms       1.827ms       0.000us         0.00%      15.585us      15.585us             1  
+                                             aten::silu         2.26%      41.441us        89.57%       1.641ms     546.874us       6.816us        51.32%       9.120us       3.040us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us        51.32%       6.816us       2.272us             3  
+                                              aten::mul         1.45%      26.581us         2.47%      45.261us      15.087us       6.465us        48.68%       6.465us       2.155us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.465us        48.68%       6.465us       2.155us             3  
+                                Activity Buffer Request        78.54%       1.439ms        78.54%       1.439ms       1.439ms       2.304us        17.35%       2.304us       2.304us             1  
+                                            aten::slice         1.41%      25.869us         1.74%      31.870us       5.312us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.33%       6.001us         0.33%       6.001us       1.000us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.78%     179.164us         9.78%     179.164us      29.861us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.090us         0.28%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.787ms
-Self CUDA time total: 13.185us
+Self CPU time total: 1.832ms
+Self CUDA time total: 13.281us
 
 
 
@@ -4320,20 +4320,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     143.964us       937.33%     143.964us     143.964us             1  
-                                            torch_eager        21.41%     103.402us        98.95%     477.918us     477.918us       0.000us         0.00%      18.047us      18.047us             1  
-                                             aten::silu         9.04%      43.640us        62.61%     302.394us     100.798us       7.872us        51.25%      10.560us       3.520us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        51.25%       7.872us       2.624us             3  
-                                              aten::mul         5.13%      24.761us         8.85%      42.722us      14.241us       7.487us        48.75%       7.487us       2.496us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.487us        48.75%       7.487us       2.496us             3  
-                                Activity Buffer Request        22.09%     106.692us        22.09%     106.692us     106.692us       2.688us        17.50%       2.688us       2.688us             1  
-                                            aten::slice         4.94%      23.880us         6.09%      29.400us       4.900us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.14%       5.520us         1.14%       5.520us       0.920us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        35.20%     170.023us        35.20%     170.023us      28.337us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         1.05%       5.060us         1.05%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     150.877us       970.08%     150.877us     150.877us             1  
+                                            torch_eager        20.61%     104.763us        99.03%     503.283us     503.283us       0.000us         0.00%      18.241us      18.241us             1  
+                                             aten::silu         8.60%      43.701us        63.19%     321.148us     107.049us       7.969us        51.24%      10.657us       3.552us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.969us        51.24%       7.969us       2.656us             3  
+                                              aten::mul         5.45%      27.720us         8.99%      45.690us      15.230us       7.584us        48.76%       7.584us       2.528us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.584us        48.76%       7.584us       2.528us             3  
+                                Activity Buffer Request        24.24%     123.213us        24.24%     123.213us     123.213us       2.688us        17.28%       2.688us       2.688us             1  
+                                            aten::slice         5.04%      25.603us         6.23%      31.682us       5.280us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.20%       6.079us         1.20%       6.079us       1.013us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        33.88%     172.204us        33.88%     172.204us      28.701us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.97%       4.940us         0.97%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 482.978us
-Self CUDA time total: 15.359us
+Self CPU time total: 508.223us
+Self CUDA time total: 15.553us
 
 
 
@@ -4343,20 +4343,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     154.301us      1078.65%     154.301us     154.301us             1  
-                                            torch_eager         5.96%     107.399us        99.74%       1.796ms       1.796ms       0.000us         0.00%      16.769us      16.769us             1  
-                                             aten::silu         2.38%      42.931us        89.51%       1.612ms     537.266us       7.328us        51.23%       9.792us       3.264us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        51.23%       7.328us       2.443us             3  
-                                              aten::mul         1.49%      26.893us         2.55%      45.883us      15.294us       6.977us        48.77%       6.977us       2.326us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.977us        48.77%       6.977us       2.326us             3  
-                                Activity Buffer Request        78.67%       1.417ms        78.67%       1.417ms       1.417ms       2.464us        17.22%       2.464us       2.464us             1  
-                                            aten::slice         1.40%      25.140us         1.72%      31.031us       5.172us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.33%       5.891us         0.33%       5.891us       0.982us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.51%     171.283us         9.51%     171.283us      28.547us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.600us         0.26%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     156.541us      1089.44%     156.541us     156.541us             1  
+                                            torch_eager         6.81%     125.673us        99.72%       1.840ms       1.840ms       0.000us         0.00%      16.866us      16.866us             1  
+                                             aten::silu         2.28%      42.101us        88.57%       1.634ms     544.654us       7.361us        51.23%       9.858us       3.286us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        51.23%       7.361us       2.454us             3  
+                                              aten::mul         1.53%      28.200us         2.53%      46.622us      15.541us       7.008us        48.77%       7.008us       2.336us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.008us        48.77%       7.008us       2.336us             3  
+                                Activity Buffer Request        77.96%       1.438ms        77.96%       1.438ms       1.438ms       2.497us        17.38%       2.497us       2.497us             1  
+                                            aten::slice         1.46%      26.979us         1.81%      33.310us       5.552us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.34%       6.331us         0.34%       6.331us       1.055us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.33%     172.076us         9.33%     172.076us      28.679us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.28%       5.210us         0.28%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.801ms
-Self CUDA time total: 14.305us
+Self CPU time total: 1.845ms
+Self CUDA time total: 14.369us
 
 
 
@@ -4366,20 +4366,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     154.686us      1002.89%     154.686us     154.686us             1  
-                                            torch_eager        22.31%     107.382us        99.03%     476.668us     476.668us       0.000us         0.00%      18.080us      18.080us             1  
-                                             aten::silu         9.43%      45.390us        60.13%     289.404us      96.468us       7.872us        51.04%      10.528us       3.509us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        51.04%       7.872us       2.624us             3  
-                                              aten::mul         6.54%      31.461us        10.39%      50.022us      16.674us       7.552us        48.96%       7.552us       2.517us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.552us        48.96%       7.552us       2.517us             3  
-                                Activity Buffer Request        19.41%      93.401us        19.41%      93.401us      93.401us       2.656us        17.22%       2.656us       2.656us             1  
-                                            aten::slice         5.01%      24.090us         6.20%      29.860us       4.977us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         1.20%       5.770us         1.20%       5.770us       0.962us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        35.15%     169.174us        35.15%     169.174us      28.196us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.97%       4.650us         0.97%       4.650us       4.650us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     149.754us       962.92%     149.754us     149.754us             1  
+                                            torch_eager        21.77%     106.163us        98.85%     481.952us     481.952us       0.000us         0.00%      18.240us      18.240us             1  
+                                             aten::silu         8.65%      42.151us        61.90%     301.788us     100.596us       7.968us        51.23%      10.656us       3.552us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        51.23%       7.968us       2.656us             3  
+                                              aten::mul         5.09%      24.801us         8.77%      42.752us      14.251us       7.584us        48.77%       7.584us       2.528us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.584us        48.77%       7.584us       2.528us             3  
+                                Activity Buffer Request        21.73%     105.953us        21.73%     105.953us     105.953us       2.688us        17.28%       2.688us       2.688us             1  
+                                            aten::slice         5.14%      25.050us         6.41%      31.249us       5.208us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         1.27%       6.199us         1.27%       6.199us       1.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        35.20%     171.635us        35.20%     171.635us      28.606us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         1.15%       5.600us         1.15%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 481.318us
-Self CUDA time total: 15.424us
+Self CPU time total: 487.552us
+Self CUDA time total: 15.552us
 
 
 
@@ -4389,20 +4389,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.678us       692.09%     155.678us     155.678us             1  
-                                            torch_eager         6.04%     109.222us        99.73%       1.805ms       1.805ms       0.000us         0.00%      26.365us      26.365us             1  
-                                             aten::silu         2.28%      41.351us        89.49%       1.620ms     539.866us      11.614us        51.63%      15.485us       5.162us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.614us        51.63%      11.614us       3.871us             3  
-                                              aten::mul         1.47%      26.681us         2.47%      44.641us      14.880us      10.880us        48.37%      10.880us       3.627us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.880us        48.37%      10.880us       3.627us             3  
-                                Activity Buffer Request        78.73%       1.425ms        78.73%       1.425ms       1.425ms       3.871us        17.21%       3.871us       3.871us             1  
-                                            aten::slice         1.39%      25.188us         1.73%      31.390us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
-                                       aten::as_strided         0.34%       6.202us         0.34%       6.202us       1.034us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.47%     171.352us         9.47%     171.352us      28.559us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       4.900us         0.27%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     187.357us       834.00%     187.357us     187.357us             1  
+                                            torch_eager         6.93%     128.860us        99.74%       1.856ms       1.856ms       0.000us         0.00%      26.369us      26.369us             1  
+                                             aten::silu         2.32%      43.123us        88.23%       1.642ms     547.175us      11.616us        51.71%      15.520us       5.173us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.616us        51.71%      11.616us       3.872us             3  
+                                              aten::mul         1.63%      30.312us         2.74%      50.922us      16.974us      10.849us        48.29%      10.849us       3.616us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.849us        48.29%      10.849us       3.616us             3  
+                                Activity Buffer Request        77.79%       1.447ms        77.79%       1.447ms       1.447ms       3.904us        17.38%       3.904us       3.904us             1  
+                                            aten::slice         1.49%      27.691us         1.84%      34.251us       5.708us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.35%       6.560us         0.35%       6.560us       1.093us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         9.23%     171.734us         9.23%     171.734us      28.622us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       4.930us         0.26%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.810ms
-Self CUDA time total: 22.494us
+Self CPU time total: 1.860ms
+Self CUDA time total: 22.465us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4419,7 +4419,7 @@ torch_eager              cuda_T512_D768         0.05  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 216ms
+Installed 37 packages in 251ms
 </div>
 </div>
 <div class="cell-artifacts">
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg
index 961d35bc69df12d3f8c1e9441cc14de8f19fb723..c90094a9212ed4b3ea466620aa29c029e98de04f 100644
--- a/activation/results/artifacts/combine/latency.svg
+++ b/activation/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49127439c8b28e18efed1525d57e9bb48bdb632034f2f84a60940f7d447aff24
-size 20647
+oid sha256:085b4a64bddea2955d6d074836121ec2e120fb1ca9140f3ccb75e8358e4526b3
+size 20644
diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html
index 2ed3e05955eb7f6d843de731dbef9c8c20788b83..aefcf7c048ef413bda722db3be44aa8b9b9cef43 100644
--- a/activation/results/combined_results.html
+++ b/activation/results/combined_results.html
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:40.869549</dc:date>
+    <dc:date>2025-10-31T20:14:01.265668</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4256,83 +4256,83 @@ body[data-tool="eraser"] .main-content {
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 60.23 430.151687  L 847.294169 430.151687  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 447.291581  L 847.294169 447.291581  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="430.151687" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="447.291581" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="433.950906" transform="rotate(-0 53.23 433.950906)">0.025</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="451.090799" transform="rotate(-0 53.23 451.090799)">0.025</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 60.23 360.098012  L 847.294169 360.098012  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 372.461283  L 847.294169 372.461283  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="360.098012" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="372.461283" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="363.897231" transform="rotate(-0 53.23 363.897231)">0.030</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.260501" transform="rotate(-0 53.23 376.260501)">0.030</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 60.23 290.044337  L 847.294169 290.044337  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 297.630984  L 847.294169 297.630984  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="290.044337" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="297.630984" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="293.843555" transform="rotate(-0 53.23 293.843555)">0.035</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.430203" transform="rotate(-0 53.23 301.430203)">0.035</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 60.23 219.990661  L 847.294169 219.990661  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 222.800686  L 847.294169 222.800686  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="219.990661" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="222.800686" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="223.78988" transform="rotate(-0 53.23 223.78988)">0.040</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="226.599905" transform="rotate(-0 53.23 226.599905)">0.040</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 60.23 149.936986  L 847.294169 149.936986  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 147.970388  L 847.294169 147.970388  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_14">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="149.936986" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="147.970388" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_14">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="153.736205" transform="rotate(-0 53.23 153.736205)">0.045</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="151.769607" transform="rotate(-0 53.23 151.769607)">0.045</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 60.23 79.883311  L 847.294169 79.883311  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 73.14009  L 847.294169 73.14009  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_15">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="79.883311" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="73.14009" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_15">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="83.682529" transform="rotate(-0 53.23 83.682529)">0.050</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="76.939309" transform="rotate(-0 53.23 76.939309)">0.050</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4340,37 +4340,37 @@ body[data-tool="eraser"] .main-content {
     </g>
    </g>
    <g id="series--hf-kernels-swiglu" class="series">
-    <path d="M 96.005644 451.16779  L 185.444754 372.273341  L 274.883864 372.427459  L 364.322974 385.723647  L 453.762084 379.572934  L 543.201194 400.294811  L 632.640304 395.405064  L 722.079415 397.492664  L 811.518525 390.066975  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 451.16779  L 185.444754 376.487152  L 274.883864 390.555248  L 364.322974 389.208303  L 453.762084 410.624734  L 543.201194 412.405695  L 632.640304 383.371541  L 722.079415 400.283188  L 811.518525 398.936242  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
      <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="185.444754" y="372.273341" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="274.883864" y="372.427459" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="364.322974" y="385.723647" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="453.762084" y="379.572934" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="543.201194" y="400.294811" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="632.640304" y="395.405064" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="722.079415" y="397.492664" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="811.518525" y="390.066975" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="185.444754" y="376.487152" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="274.883864" y="390.555248" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="364.322974" y="389.208303" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="453.762084" y="410.624734" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="543.201194" y="412.405695" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="632.640304" y="383.371541" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="722.079415" y="400.283188" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="811.518525" y="398.936242" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 96.005644 191.815073  L 185.444754 47.08418  L 274.883864 72.023288  L 364.322974 89.116386  L 453.762084 69.781571  L 543.201194 83.10578  L 632.640304 73.424362  L 722.079415 89.116385  L 811.518525 103.687549  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 155.288791  L 185.444754 47.08418  L 274.883864 47.967177  L 364.322974 65.193113  L 453.762084 62.798543  L 543.201194 92.28168  L 632.640304 73.424445  L 722.079415 89.138808  L 811.518525 87.342881  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
-     <use ns4:href="#m9b8c54d372" x="96.005644" y="191.815073" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="96.005644" y="155.288791" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="274.883864" y="72.023288" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="364.322974" y="89.116386" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="453.762084" y="69.781571" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="543.201194" y="83.10578" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="632.640304" y="73.424362" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="722.079415" y="89.116385" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="811.518525" y="103.687549" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="274.883864" y="47.967177" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="364.322974" y="65.193113" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="453.762084" y="62.798543" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="543.201194" y="92.28168" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="632.640304" y="73.424445" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="722.079415" y="89.138808" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="811.518525" y="87.342881" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
@@ -4428,7 +4428,7 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 4.28s
+Cell: combine | 4.32s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4554,7 +4554,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 222ms
+Installed 37 packages in 213ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4567,7 +4567,7 @@ Installed 37 packages in 222ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:40.869549</dc:date>
+    <dc:date>2025-10-31T20:14:01.265668</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4716,83 +4716,83 @@ Installed 37 packages in 222ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 60.23 430.151687  L 847.294169 430.151687  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 447.291581  L 847.294169 447.291581  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="430.151687" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="447.291581" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="433.950906" transform="rotate(-0 53.23 433.950906)">0.025</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="451.090799" transform="rotate(-0 53.23 451.090799)">0.025</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 60.23 360.098012  L 847.294169 360.098012  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 372.461283  L 847.294169 372.461283  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="360.098012" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="372.461283" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="363.897231" transform="rotate(-0 53.23 363.897231)">0.030</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.260501" transform="rotate(-0 53.23 376.260501)">0.030</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 60.23 290.044337  L 847.294169 290.044337  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 297.630984  L 847.294169 297.630984  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="290.044337" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="297.630984" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="293.843555" transform="rotate(-0 53.23 293.843555)">0.035</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.430203" transform="rotate(-0 53.23 301.430203)">0.035</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 60.23 219.990661  L 847.294169 219.990661  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 222.800686  L 847.294169 222.800686  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="219.990661" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="222.800686" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="223.78988" transform="rotate(-0 53.23 223.78988)">0.040</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="226.599905" transform="rotate(-0 53.23 226.599905)">0.040</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 60.23 149.936986  L 847.294169 149.936986  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 147.970388  L 847.294169 147.970388  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_14">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="149.936986" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="147.970388" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_14">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="153.736205" transform="rotate(-0 53.23 153.736205)">0.045</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="151.769607" transform="rotate(-0 53.23 151.769607)">0.045</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 60.23 79.883311  L 847.294169 79.883311  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 60.23 73.14009  L 847.294169 73.14009  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_15">
       <g>
-       <use ns4:href="#m0fca2865ba" x="60.23" y="79.883311" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="60.23" y="73.14009" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_15">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="83.682529" transform="rotate(-0 53.23 83.682529)">0.050</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="76.939309" transform="rotate(-0 53.23 76.939309)">0.050</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4800,37 +4800,37 @@ Installed 37 packages in 222ms
     </g>
    </g>
    <g id="series--hf-kernels-swiglu" class="series">
-    <path d="M 96.005644 451.16779  L 185.444754 372.273341  L 274.883864 372.427459  L 364.322974 385.723647  L 453.762084 379.572934  L 543.201194 400.294811  L 632.640304 395.405064  L 722.079415 397.492664  L 811.518525 390.066975  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 451.16779  L 185.444754 376.487152  L 274.883864 390.555248  L 364.322974 389.208303  L 453.762084 410.624734  L 543.201194 412.405695  L 632.640304 383.371541  L 722.079415 400.283188  L 811.518525 398.936242  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
      <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="185.444754" y="372.273341" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="274.883864" y="372.427459" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="364.322974" y="385.723647" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="453.762084" y="379.572934" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="543.201194" y="400.294811" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="632.640304" y="395.405064" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="722.079415" y="397.492664" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="811.518525" y="390.066975" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="185.444754" y="376.487152" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="274.883864" y="390.555248" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="364.322974" y="389.208303" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="453.762084" y="410.624734" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="543.201194" y="412.405695" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="632.640304" y="383.371541" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="722.079415" y="400.283188" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="811.518525" y="398.936242" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 96.005644 191.815073  L 185.444754 47.08418  L 274.883864 72.023288  L 364.322974 89.116386  L 453.762084 69.781571  L 543.201194 83.10578  L 632.640304 73.424362  L 722.079415 89.116385  L 811.518525 103.687549  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 96.005644 155.288791  L 185.444754 47.08418  L 274.883864 47.967177  L 364.322974 65.193113  L 453.762084 62.798543  L 543.201194 92.28168  L 632.640304 73.424445  L 722.079415 89.138808  L 811.518525 87.342881  " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p620c7d392f)">
-     <use ns4:href="#m9b8c54d372" x="96.005644" y="191.815073" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="96.005644" y="155.288791" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="274.883864" y="72.023288" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="364.322974" y="89.116386" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="453.762084" y="69.781571" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="543.201194" y="83.10578" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="632.640304" y="73.424362" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="722.079415" y="89.116385" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="811.518525" y="103.687549" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="274.883864" y="47.967177" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="364.322974" y="65.193113" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="453.762084" y="62.798543" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="543.201194" y="92.28168" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="632.640304" y="73.424445" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="722.079415" y="89.138808" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="811.518525" y="87.342881" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
index 4dbcd737042ccd89af4999232ce91680c8569342..7bfddcfb2c66ba429fccc98758725309b85f6780 100644
--- a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
+++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
@@ -1,24 +1,24 @@
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04394999996293336, "p50": 0.04566100000147344, "p90": 0.046750000024076144, "mean": 0.04579239999884521, "iqr": 0.0020500000346146408, "raw_times": [0.0446999999894615, 0.047901000016281614, 0.046750000024076144, 0.04566100000147344, 0.04394999996293336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05609099997627709, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05193099997313766, "p50": 0.05449100001442275, "p90": 0.054510999973444996, "mean": 0.05559319998837964, "iqr": 0.0010200000133409048, "raw_times": [0.05349099996010409, 0.05449100001442275, 0.06354200002078869, 0.05193099997313766, 0.054510999973444996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.060221000012461445, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051560999963840004, "p50": 0.05184100001542902, "p90": 0.05310099999178419, "mean": 0.05230499999697713, "iqr": 0.0014099999816608033, "raw_times": [0.05184100001542902, 0.05333100000370905, 0.05310099999178419, 0.05169100001012339, 0.051560999963840004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058330999991085264, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05121099997040801, "p50": 0.051831000007496186, "p90": 0.052310999990368146, "mean": 0.05185479999454401, "iqr": 0.0008799999591246888, "raw_times": [0.05121099997040801, 0.051831000007496186, 0.052310999990368146, 0.05248999997320425, 0.05143100003124346], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05627100000538121, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050751000003401714, "p50": 0.051640999970459234, "p90": 0.05217000000357075, "mean": 0.05161080000561924, "iqr": 0.0008689999617672584, "raw_times": [0.05219100000886101, 0.05217000000357075, 0.050751000003401714, 0.05130100004180349, 0.051640999970459234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055421000013211597, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04883100001507046, "p50": 0.049950999994052836, "p90": 0.05039000001261229, "mean": 0.04992260001017712, "iqr": 0.0006600000119760807, "raw_times": [0.04883100001507046, 0.05071100002851381, 0.04973000000063621, 0.05039000001261229, 0.049950999994052836], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05684100000280523, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04895099999657759, "p50": 0.050181000005977694, "p90": 0.05176100000880979, "mean": 0.05066500000339147, "iqr": 0.0021600000081889448, "raw_times": [0.04960100000062084, 0.05176100000880979, 0.050181000005977694, 0.05283100000497143, 0.04895099999657759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05629100002124687, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048800999991271965, "p50": 0.051240999994206504, "p90": 0.0513809999915793, "mean": 0.05085500000632237, "iqr": 0.00043999995114063495, "raw_times": [0.051240999994206504, 0.048800999991271965, 0.051911000014115416, 0.050941000040438666, 0.0513809999915793], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056131000008008414, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04940100001249448, "p50": 0.05085099996904319, "p90": 0.05221100002472667, "mean": 0.05112659999895186, "iqr": 0.0015410000742122065, "raw_times": [0.050669999950514466, 0.05221100002472667, 0.04940100001249448, 0.0525000000379805, 0.05085099996904319], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053861000026245165, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04947999997284569, "p50": 0.05073100004437947, "p90": 0.05098100001532657, "mean": 0.05063280001422754, "iqr": 0.0010900000120273035, "raw_times": [0.04947999997284569, 0.05098100001532657, 0.04989100000329927, 0.05073100004437947, 0.052081000035286706], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054841000007854745, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05039100000203689, "p50": 0.051160999987587275, "p90": 0.05154000001539316, "mean": 0.051364599994485616, "iqr": 0.00038000001723048626, "raw_times": [0.051160999987587275, 0.05257099996924808, 0.05039100000203689, 0.05154000001539316, 0.051159999998162675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05513099995368975, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048071000037452905, "p50": 0.05178100002467545, "p90": 0.0526809999996658, "mean": 0.05150900001353875, "iqr": 0.0032599999713056604, "raw_times": [0.04942100002836014, 0.0526809999996658, 0.05178100002467545, 0.05559099997753947, 0.048071000037452905], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05527100000790597, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04963099996757592, "p50": 0.05203099999562255, "p90": 0.052549999963957816, "mean": 0.05276679999042244, "iqr": 0.0005189999683352653, "raw_times": [0.05759100002933337, 0.05203099999562255, 0.052549999963957816, 0.04963099996757592, 0.05203099999562255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07661199998665325, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049701000023105735, "p50": 0.051581000036549085, "p90": 0.05290100000365783, "mean": 0.05255880001868718, "iqr": 0.002381000001605571, "raw_times": [0.05290100000365783, 0.058091000028070994, 0.051581000036549085, 0.05052000000205226, 0.049701000023105735], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054920000025049376, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0517009999612128, "p50": 0.05219999997052582, "p90": 0.05233100000623381, "mean": 0.05215079999061345, "iqr": 0.0001500000053056283, "raw_times": [0.05233100000623381, 0.05234100001416664, 0.05219999997052582, 0.0517009999612128, 0.05218100000092818], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055141000018466, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05047100000865612, "p50": 0.05349100001694751, "p90": 0.05691100000149163, "mean": 0.057148999997025385, "iqr": 0.004350000040176383, "raw_times": [0.05047100000865612, 0.05349100001694751, 0.07231099999671642, 0.05256099996131525, 0.05691100000149163], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05554099999471873, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049690000025748304, "p50": 0.050921000024573004, "p90": 0.051730999985011294, "mean": 0.051232800001344, "iqr": 0.0010800000040944724, "raw_times": [0.05065099998091682, 0.051730999985011294, 0.05317099999047059, 0.049690000025748304, 0.050921000024573004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05373099997996178, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05013100002315696, "p50": 0.05073099998753605, "p90": 0.052470999946763186, "mean": 0.051448999988679134, "iqr": 0.001829999973779195, "raw_times": [0.05013100002315696, 0.05073099998753605, 0.05327100001295548, 0.052470999946763186, 0.05064099997298399], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05419999996547631, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04918100000850245, "p50": 0.050670999996782484, "p90": 0.05192099996520483, "mean": 0.050938799995492445, "iqr": 0.0013709999393540784, "raw_times": [0.04918100000850245, 0.05192099996520483, 0.05237099998112171, 0.05055000002585075, 0.050670999996782484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05559099997753947, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049871000044277025, "p50": 0.05047100000865612, "p90": 0.05118100000345294, "mean": 0.050820800015571876, "iqr": 0.0007699999855503847, "raw_times": [0.049871000044277025, 0.05041100001790255, 0.05217000000357075, 0.05047100000865612, 0.05118100000345294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05564100001720362, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05115100003649786, "p50": 0.052071000027353875, "p90": 0.05212100001017461, "mean": 0.05199700001412566, "iqr": 0.0006100000291553442, "raw_times": [0.05115100003649786, 0.052071000027353875, 0.053131000015582686, 0.05212100001017461, 0.05151099998101927], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05440099999987069, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04927099996621109, "p50": 0.051500999973086437, "p90": 0.05194099998107049, "mean": 0.05114499998626343, "iqr": 0.000919999990856013, "raw_times": [0.051500999973086437, 0.04927099996621109, 0.051991000020734646, 0.05194099998107049, 0.05102099999021448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054591000036907644, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049930999978187174, "p50": 0.050361000035081815, "p90": 0.05102099999021448, "mean": 0.05066480000550655, "iqr": 0.0008009999987734773, "raw_times": [0.050219999991441, 0.050361000035081815, 0.05179100003260828, 0.049930999978187174, 0.05102099999021448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05545099998016667, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0500799999940682, "p50": 0.05195099998900332, "p90": 0.051991000020734646, "mean": 0.05318280000210507, "iqr": 0.0014600000213249587, "raw_times": [0.0500799999940682, 0.05195099998900332, 0.051991000020734646, 0.05053099999940969, 0.061361000007309485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05489099999067548, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06906199996592477, "p50": 0.07093199997143529, "p90": 0.07169200000589626, "mean": 0.07107379998387842, "iqr": 0.0011000000199601345, "raw_times": [0.07093199997143529, 0.07309099999019963, 0.07059199998593613, 0.07169200000589626, 0.06906199996592477], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07642200000645971, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.08879199998546028, "p90": 0.08886199998414668, "mean": 0.0890762000040013, "iqr": 0.00037899997096246807, "raw_times": [0.08730199999718025, 0.08879199998546028, 0.08848300001318421, 0.08886199998414668, 0.09194200004003505], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.091862999965997, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08465199999818651, "p50": 0.08821300002637145, "p90": 0.08871199997884105, "mean": 0.08770840000806857, "iqr": 0.0007599999776175537, "raw_times": [0.08465199999818651, 0.0879520000012235, 0.08821300002637145, 0.08901300003572032, 0.08871199997884105], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09156300001222917, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08501199999955134, "p50": 0.08710200000905388, "p90": 0.08719199996676252, "mean": 0.08665020000080403, "iqr": 0.001349999934063817, "raw_times": [0.08501199999955134, 0.08710200000905388, 0.08719199996676252, 0.0858420000326987, 0.08810299999595372], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09103200000026845, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575200001814665, "p50": 0.08690200002092752, "p90": 0.08706200003416598, "mean": 0.08684220001669019, "iqr": 0.00029900002118665725, "raw_times": [0.08773199999723147, 0.08676300001297932, 0.08690200002092752, 0.08706200003416598, 0.08575200001814665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09036199998035954, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08490200002597703, "p50": 0.08731200000511308, "p90": 0.0877829999694768, "mean": 0.08806820000017979, "iqr": 0.001451000002816727, "raw_times": [0.09401200003367194, 0.08731200000511308, 0.08633199996666008, 0.08490200002597703, 0.0877829999694768], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0907329999790818, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0847820000444699, "p50": 0.08513199998105847, "p90": 0.08660200001031626, "mean": 0.08566600000676772, "iqr": 0.0016600000094513234, "raw_times": [0.08494200000086494, 0.0847820000444699, 0.08687199999712902, 0.08660200001031626, 0.08513199998105847], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911219999579771, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08356199998615921, "p50": 0.0846430000365217, "p90": 0.08576199996923606, "mean": 0.08508039999242101, "iqr": 0.0011189999895577785, "raw_times": [0.08356199998615921, 0.0867919999905098, 0.08464299997967828, 0.08576199996923606, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08955300000934585, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08469199997307442, "p50": 0.08614199998646654, "p90": 0.08723299998791845, "mean": 0.08654439999418173, "iqr": 0.0011309999763398082, "raw_times": [0.08469199997307442, 0.08610200001157864, 0.08614199998646654, 0.08855300001187061, 0.08723299998791845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09115300002804361, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08576300001550408, "p50": 0.08703200001036748, "p90": 0.08823299998539369, "mean": 0.09075460000076419, "iqr": 0.0015310000094359566, "raw_times": [0.10604300001659794, 0.08823299998539369, 0.08703200001036748, 0.08670199997595773, 0.08576300001550408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985199997368909, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14525299997103502, "p50": 0.1457439999512644, "p90": 0.1459139999724357, "mean": 0.1457395999750588, "iqr": 0.00044099999740865314, "raw_times": [0.14525299997103502, 0.14547299997502705, 0.1457439999512644, 0.14631400000553185, 0.1459139999724357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1472430000148961, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16037399996093882, "p50": 0.16231400002197915, "p90": 0.16309400001546237, "mean": 0.1622881999992387, "iqr": 0.0012190000120426703, "raw_times": [0.16309400001546237, 0.16231400002197915, 0.16378399999439353, 0.1618750000034197, 0.16037399996093882], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16341399998509587, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08445299999948475, "p50": 0.08518200002072263, "p90": 0.08666200000106983, "mean": 0.08572240001285536, "iqr": 0.0017899999988912896, "raw_times": [0.08445299999948475, 0.08744300004082106, 0.08518200002072263, 0.08666200000106983, 0.08487200000217854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0890119999894523, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08437200000344092, "p50": 0.08463200003916427, "p90": 0.08609200000364581, "mean": 0.08522400000856578, "iqr": 0.0015900000107649248, "raw_times": [0.08463200003916427, 0.08609200000364581, 0.08652200000369703, 0.08437200000344092, 0.08450199999288088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08977199996706986, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08375199996635274, "p50": 0.08519199997181204, "p90": 0.08627200003274993, "mean": 0.08607399998936671, "iqr": 0.0020100000597267353, "raw_times": [0.08375199996635274, 0.0842619999730232, 0.08627200003274993, 0.08519199997181204, 0.09089200000289566], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08821199998010343, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08470200003785067, "p50": 0.08566200000359458, "p90": 0.08573299999170558, "mean": 0.08566220001284819, "iqr": 0.0006109999617365247, "raw_times": [0.08470200003785067, 0.08709200000112105, 0.08512200002996906, 0.08566200000359458, 0.08573299999170558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08864200003699807, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08451200000081371, "p50": 0.08525300000883362, "p90": 0.08580199994412396, "mean": 0.08525219999455658, "iqr": 0.0009299999419454252, "raw_times": [0.08580199994412396, 0.08525300000883362, 0.08451200000081371, 0.08487200000217854, 0.08582200001683304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08942300001990588, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08533199996918484, "p50": 0.08693199998788259, "p90": 0.09015199998430035, "mean": 0.08883799998784525, "iqr": 0.0043200000163778896, "raw_times": [0.08533199996918484, 0.09015199998430035, 0.08583199996792246, 0.08693199998788259, 0.09594200002993603], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09176200001093093, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08384200003774822, "p50": 0.08611200001951147, "p90": 0.08663199997727133, "mean": 0.08570400000280642, "iqr": 0.001730000008137722, "raw_times": [0.08384200003774822, 0.08611200001951147, 0.08703200001036748, 0.08663199997727133, 0.08490199996913361], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941200002254845, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08507300003657292, "p50": 0.0865819999944506, "p90": 0.08741199997075455, "mean": 0.09195439998848087, "iqr": 0.0020300000187489786, "raw_times": [0.11532299998862072, 0.0865819999944506, 0.08741199997075455, 0.08538199995200557, 0.08507300003657292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08733200002097874, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09419299999535724, "p50": 0.09539199999153425, "p90": 0.09730299996135727, "mean": 0.09678459998667677, "iqr": 0.002380999944762152, "raw_times": [0.10211299996853995, 0.09730299996135727, 0.09492200001659512, 0.09539199999153425, 0.09419299999535724], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09651299995994123, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.10080199996309602, "p50": 0.10192199999892182, "p90": 0.1026219999857858, "mean": 0.10294419998899684, "iqr": 0.0008999999749903509, "raw_times": [0.10765299998638511, 0.10172200001079545, 0.1026219999857858, 0.10192199999892182, 0.10080199996309602], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10299199999508346, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4861929999719905, "p50": 0.4890019999947981, "p90": 0.48961200002395344, "mean": 0.48862639999924795, "iqr": 0.001079000014669873, "raw_times": [0.48979199999621414, 0.4861929999719905, 0.48961200002395344, 0.4890019999947981, 0.48853300000928357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48705300002893637, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49736299996538946, "p50": 0.49848299994437184, "p90": 0.49918199999865465, "mean": 0.4987367999774506, "iqr": 0.0007590000450363732, "raw_times": [0.4984229999536183, 0.49848299994437184, 0.49918199999865465, 0.5002330000252186, 0.49736299996538946], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4985730000157673, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py
index 725b12c4018e4eec05c5ddccb0c88a8eae6f150d..2e38669a505cbdf181a93e97f31ed1e67ecf4883 100644
--- a/causal_conv1d/impls/cells/benchmark.py
+++ b/causal_conv1d/impls/cells/benchmark.py
@@ -4,28 +4,37 @@
 #     "numpy",
 #     "torch==2.8.0",
 #     "kernels-benchmark-tools",
-#     "kernels",
 # ]
 #
 # [tool.uv.sources]
 # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
+import torch.nn.functional as F
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
 
-# Load the causal conv1d kernel
-causal_conv1d = get_kernel("kernels-community/causal-conv1d")
 
+def torch_causal_conv1d(input_tensor, weight, bias):
+    # Convert to weight dtype for computation
+    x = input_tensor.to(weight.dtype)
+    dim = weight.shape[0]
+    width = weight.shape[1]
+    seqlen = input_tensor.shape[-1]
 
-def hf_kernels_causal_conv1d(input_tensor, weight, bias):
-    return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
+    # Depthwise causal conv1d using PyTorch
+    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
+
+    # Truncate to original sequence length
+    out = out[..., :seqlen]
+
+    # Convert back to original dtype
+    return out.to(input_tensor.dtype)
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
-    impl_name="hf_kernels_causal_conv1d",
-    impl_tags={"family": "hf-kernels", "backend": "cuda"},
-    impl_func=hf_kernels_causal_conv1d,
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_causal_conv1d,
 )
\ No newline at end of file
diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
index e161062d07cab205d4d881403fd3310ed83e20ca..cb1bde40be01c47bdde38e8da86912f92e3be9c0 100644
--- a/causal_conv1d/impls/hf_kernels_causal_conv1d.html
+++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
@@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.28s
+Cell: nv | 0.21s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/causal_conv1d/impls/hf_kernels_causal_conv1d.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/causal_conv1d/impls/hf_kernels_causal_conv1d.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/causal-conv1d" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">
@@ -4122,7 +4123,7 @@ Cell: nv | 0.28s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:51:43 2025       
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:25 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   33C    P0             79W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,11 +4156,12 @@ Cell: nv | 0.28s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 5.66s
+Cell: benchmark | 9.11s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/causal_conv1d/impls/hf_kernels_causal_conv1d.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/causal_conv1d/impls/hf_kernels_causal_conv1d.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/causal-conv1d" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="31">
 <div class="code-wrap">
@@ -4208,19 +4210,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     148.031us      3643.39%     148.031us     148.031us             1  
-                               hf_kernels_causal_conv1d         8.90%     165.322us        99.57%       1.851ms       1.851ms       0.000us         0.00%       5.503us       5.503us             1  
-                                         CausalConv1dFn         5.85%     108.724us        90.68%       1.685ms     561.740us       0.000us         0.00%       5.503us       1.834us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.35%      25.159us        81.18%       1.509ms     502.865us       4.063us       100.00%       5.503us       1.834us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
-                                Activity Buffer Request        77.32%       1.437ms        77.32%       1.437ms       1.437ms       1.440us        35.44%       1.440us       1.440us             1  
-                                       aten::empty_like         0.95%      17.630us         3.65%      67.900us      22.633us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.70%      50.270us         2.70%      50.270us      16.757us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.50%      46.532us         2.50%      46.532us      15.511us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.43%       7.900us         0.43%       7.900us       7.900us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     180.703us      4446.43%     180.703us     180.703us             1  
+                               hf_kernels_causal_conv1d         8.48%     160.534us        99.62%       1.886ms       1.886ms       0.000us         0.00%       5.504us       5.504us             1  
+                                         CausalConv1dFn         6.47%     122.423us        91.15%       1.726ms     575.261us       0.000us         0.00%       5.504us       1.835us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.51%      28.612us        80.84%       1.531ms     510.207us       4.064us       100.00%       5.504us       1.835us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
+                                Activity Buffer Request        76.71%       1.452ms        76.71%       1.452ms       1.452ms       1.440us        35.43%       1.440us       1.440us             1  
+                                       aten::empty_like         1.07%      20.220us         3.84%      72.741us      24.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.77%      52.521us         2.77%      52.521us      17.507us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.62%      49.571us         2.62%      49.571us      16.524us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.38%       7.101us         0.38%       7.101us       7.101us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.858ms
-Self CUDA time total: 4.063us
+Self CPU time total: 1.893ms
+Self CUDA time total: 4.064us
 
 
 
@@ -4230,19 +4232,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.926us      3229.86%     120.926us     120.926us             1  
-                               hf_kernels_causal_conv1d         5.72%      96.561us        99.68%       1.683ms       1.683ms       0.000us         0.00%       4.992us       4.992us             1  
-                                         CausalConv1dFn         4.27%      72.072us        93.97%       1.587ms     528.936us       0.000us         0.00%       4.992us       1.664us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.50%      25.350us        87.84%       1.483ms     494.459us       3.744us       100.00%       4.992us       1.664us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.744us       100.00%       3.744us       1.248us             3  
-                                Activity Buffer Request        84.49%       1.427ms        84.49%       1.427ms       1.427ms       1.248us        33.33%       1.248us       1.248us             1  
-                                       aten::empty_like         0.48%       8.160us         1.86%      31.360us      10.453us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.37%      23.200us         1.37%      23.200us       7.733us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.85%      31.292us         1.85%      31.292us      10.431us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.32%       5.320us         0.32%       5.320us       5.320us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.791us      3331.33%     125.791us     125.791us             1  
+                               hf_kernels_causal_conv1d         5.58%      96.392us        99.64%       1.721ms       1.721ms       0.000us         0.00%       5.056us       5.056us             1  
+                                         CausalConv1dFn         4.40%      76.074us        94.06%       1.625ms     541.671us       0.000us         0.00%       5.056us       1.685us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.52%      26.231us        87.95%       1.519ms     506.473us       3.776us       100.00%       5.056us       1.685us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
+                                Activity Buffer Request        84.56%       1.461ms        84.56%       1.461ms       1.461ms       1.280us        33.90%       1.280us       1.280us             1  
+                                       aten::empty_like         0.44%       7.590us         1.71%      29.520us       9.840us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.27%      21.930us         1.27%      21.930us       7.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.87%      32.290us         1.87%      32.290us      10.763us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.36%       6.200us         0.36%       6.200us       6.200us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.689ms
-Self CUDA time total: 3.744us
+Self CPU time total: 1.728ms
+Self CUDA time total: 3.776us
 
 
 
@@ -4252,18 +4254,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.942us      3255.88%     122.942us     122.942us             1  
-                               hf_kernels_causal_conv1d         6.02%     102.400us        99.66%       1.696ms       1.696ms       0.000us         0.00%       5.023us       5.023us             1  
-                                         CausalConv1dFn         4.37%      74.304us        93.64%       1.594ms     531.323us       0.000us         0.00%       5.023us       1.674us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.51%      25.778us        87.51%       1.490ms     496.532us       3.776us       100.00%       5.023us       1.674us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.758us      3330.46%     125.758us     125.758us             1  
+                               hf_kernels_causal_conv1d         5.23%      90.742us        99.66%       1.729ms       1.729ms       0.000us         0.00%       5.056us       5.056us             1  
+                                         CausalConv1dFn         4.39%      76.092us        94.43%       1.638ms     546.081us       0.000us         0.00%       5.056us       1.685us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.50%      26.031us        88.31%       1.532ms     510.660us       3.776us       100.00%       5.056us       1.685us             3  
 void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
-                                Activity Buffer Request        84.19%       1.433ms        84.19%       1.433ms       1.433ms       1.247us        33.02%       1.247us       1.247us             1  
-                                       aten::empty_like         0.48%       8.219us         1.77%      30.070us      10.023us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.28%      21.851us         1.28%      21.851us       7.284us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.81%      30.742us         1.81%      30.742us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.34%       5.821us         0.34%       5.821us       5.821us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        84.98%       1.474ms        84.98%       1.474ms       1.474ms       1.280us        33.90%       1.280us       1.280us             1  
+                                       aten::empty_like         0.47%       8.201us         1.74%      30.171us      10.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.27%      21.970us         1.27%      21.970us       7.323us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.83%      31.671us         1.83%      31.671us      10.557us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.34%       5.850us         0.34%       5.850us       5.850us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.702ms
+Self CPU time total: 1.735ms
 Self CUDA time total: 3.776us
 
 
@@ -4274,19 +4276,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     154.975us      4105.30%     154.975us     154.975us             1  
-                               hf_kernels_causal_conv1d         5.10%      97.113us        99.71%       1.897ms       1.897ms       0.000us         0.00%       5.022us       5.022us             1  
-                                         CausalConv1dFn         5.06%      96.320us        94.60%       1.800ms     599.880us       0.000us         0.00%       5.022us       1.674us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.32%      25.153us        87.78%       1.670ms     556.640us       3.775us       100.00%       5.022us       1.674us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.775us       100.00%       3.775us       1.258us             3  
-                                Activity Buffer Request        75.43%       1.435ms        75.43%       1.435ms       1.435ms       1.247us        33.03%       1.247us       1.247us             1  
-                                       aten::empty_like         0.48%       9.119us         1.76%      33.400us      11.133us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.28%      24.281us         1.28%      24.281us       8.094us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        11.03%     209.783us        11.03%     209.783us      69.928us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.600us         0.29%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.584us      3350.42%     127.584us     127.584us             1  
+                               hf_kernels_causal_conv1d         4.53%      88.983us        99.75%       1.962ms       1.962ms       0.000us         0.00%       5.088us       5.088us             1  
+                                         CausalConv1dFn         3.93%      77.252us        95.23%       1.873ms     624.219us       0.000us         0.00%       5.088us       1.696us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      26.710us        89.83%       1.766ms     588.805us       3.808us       100.00%       5.088us       1.696us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.808us       100.00%       3.808us       1.269us             3  
+                                Activity Buffer Request        74.34%       1.462ms        74.34%       1.462ms       1.462ms       1.280us        33.61%       1.280us       1.280us             1  
+                                       aten::empty_like         0.41%       8.060us         1.47%      28.990us       9.663us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.06%      20.930us         1.06%      20.930us       6.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        14.13%     277.777us        14.13%     277.777us      92.592us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       4.831us         0.25%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.902ms
-Self CUDA time total: 3.775us
+Self CPU time total: 1.966ms
+Self CUDA time total: 3.808us
 
 
 
@@ -4296,19 +4298,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.520us      2656.67%     127.520us     127.520us             1  
-                               hf_kernels_causal_conv1d         5.48%     101.023us        99.67%       1.838ms       1.838ms       0.000us         0.00%       6.400us       6.400us             1  
-                                         CausalConv1dFn         4.02%      74.081us        94.20%       1.737ms     579.070us       0.000us         0.00%       6.400us       2.133us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      25.982us        88.51%       1.632ms     544.113us       4.800us       100.00%       6.400us       2.133us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
-                                Activity Buffer Request        78.02%       1.439ms        78.02%       1.439ms       1.439ms       1.600us        33.33%       1.600us       1.600us             1  
-                                       aten::empty_like         0.45%       8.310us         1.67%      30.790us      10.263us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.22%      22.480us         1.22%      22.480us       7.493us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         9.08%     167.462us         9.08%     167.462us      55.821us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.33%       6.020us         0.33%       6.020us       6.020us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.686us      2639.84%     126.686us     126.686us             1  
+                               hf_kernels_causal_conv1d         4.55%      87.622us        99.73%       1.920ms       1.920ms       0.000us         0.00%       6.430us       6.430us             1  
+                                         CausalConv1dFn         3.92%      75.482us        95.18%       1.832ms     610.789us       0.000us         0.00%       6.430us       2.143us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.44%      27.663us        89.66%       1.726ms     575.372us       4.799us       100.00%       6.430us       2.143us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.799us       100.00%       4.799us       1.600us             3  
+                                Activity Buffer Request        74.49%       1.434ms        74.49%       1.434ms       1.434ms       1.631us        33.99%       1.631us       1.631us             1  
+                                       aten::empty_like         0.42%       8.140us         1.60%      30.770us      10.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.18%      22.630us         1.18%      22.630us       7.543us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        13.74%     264.526us        13.74%     264.526us      88.175us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.120us         0.27%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.844ms
-Self CUDA time total: 4.800us
+Self CPU time total: 1.925ms
+Self CUDA time total: 4.799us
 
 
 
@@ -4318,19 +4320,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.208us      2446.36%     118.208us     118.208us             1  
-                               hf_kernels_causal_conv1d        14.10%      77.840us        98.97%     546.449us     546.449us       0.000us         0.00%       6.464us       6.464us             1  
-                                         CausalConv1dFn        13.03%      71.942us        84.87%     468.609us     156.203us       0.000us         0.00%       6.464us       2.155us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.50%      24.830us        66.59%     367.636us     122.545us       4.832us       100.00%       6.464us       2.155us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.832us       100.00%       4.832us       1.611us             3  
-                                Activity Buffer Request        33.64%     185.743us        33.64%     185.743us     185.743us       1.632us        33.77%       1.632us       1.632us             1  
-                                       aten::empty_like         1.44%       7.931us         5.26%      29.031us       9.677us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         3.82%      21.100us         3.82%      21.100us       7.033us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        28.45%     157.063us        28.45%     157.063us      52.354us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.03%       5.680us         1.03%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.083us      2423.58%     117.083us     117.083us             1  
+                               hf_kernels_causal_conv1d        12.24%      83.203us        99.28%     674.957us     674.957us       0.000us         0.00%       6.463us       6.463us             1  
+                                         CausalConv1dFn        10.43%      70.911us        87.04%     591.754us     197.251us       0.000us         0.00%       6.463us       2.154us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.93%      26.710us        72.18%     490.682us     163.561us       4.831us       100.00%       6.463us       2.154us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.831us       100.00%       4.831us       1.610us             3  
+                                Activity Buffer Request        32.42%     220.416us        32.42%     220.416us     220.416us       1.632us        33.78%       1.632us       1.632us             1  
+                                       aten::empty_like         1.07%       7.270us         4.44%      30.161us      10.054us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.37%      22.891us         3.37%      22.891us       7.630us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.83%     243.556us        35.83%     243.556us      81.185us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.72%       4.870us         0.72%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 552.129us
-Self CUDA time total: 4.832us
+Self CPU time total: 679.827us
+Self CUDA time total: 4.831us
 
 
 
@@ -4340,19 +4342,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.887us      1226.27%     129.887us     129.887us             1  
-                               hf_kernels_causal_conv1d         5.23%      95.772us        99.69%       1.826ms       1.826ms       0.000us         0.00%      14.144us      14.144us             1  
-                                         CausalConv1dFn         4.13%      75.612us        94.46%       1.730ms     576.726us       0.000us         0.00%      14.144us       4.715us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      25.780us        88.71%       1.625ms     541.586us      10.592us       100.00%      14.144us       4.715us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us       100.00%      10.592us       3.531us             3  
-                                Activity Buffer Request        78.55%       1.439ms        78.55%       1.439ms       1.439ms       3.552us        33.53%       3.552us       3.552us             1  
-                                       aten::empty_like         0.48%       8.780us         1.63%      29.810us       9.937us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.15%      21.030us         1.15%      21.030us       7.010us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.75%     160.332us         8.75%     160.332us      53.444us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.650us         0.31%       5.650us       5.650us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.381us      1167.35%     124.381us     124.381us             1  
+                               hf_kernels_causal_conv1d         4.48%      85.542us        99.75%       1.904ms       1.904ms       0.000us         0.00%      14.271us      14.271us             1  
+                                         CausalConv1dFn         3.83%      73.182us        95.27%       1.819ms     606.282us       0.000us         0.00%      14.271us       4.757us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.41%      26.960us        89.88%       1.716ms     571.988us      10.655us       100.00%      14.271us       4.757us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.655us       100.00%      10.655us       3.552us             3  
+                                Activity Buffer Request        76.01%       1.451ms        76.01%       1.451ms       1.451ms       3.616us        33.94%       3.616us       3.616us             1  
+                                       aten::empty_like         0.43%       8.120us         1.56%      29.700us       9.900us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.13%      21.580us         1.13%      21.580us       7.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        12.45%     237.787us        12.45%     237.787us      79.262us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       4.860us         0.25%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.832ms
-Self CUDA time total: 10.592us
+Self CPU time total: 1.909ms
+Self CUDA time total: 10.655us
 
 
 
@@ -4362,19 +4364,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.356us      1093.80%     119.356us     119.356us             1  
-                               hf_kernels_causal_conv1d        19.79%      94.221us        98.72%     469.928us     469.928us       0.000us         0.00%      14.592us      14.592us             1  
-                                         CausalConv1dFn        14.74%      70.172us        78.93%     375.707us     125.236us       0.000us         0.00%      14.592us       4.864us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.30%      25.240us        58.06%     276.375us      92.125us      10.912us       100.00%      14.592us       4.864us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.912us       100.00%      10.912us       3.637us             3  
-                                Activity Buffer Request        19.79%      94.192us        19.79%      94.192us      94.192us       3.680us        33.72%       3.680us       3.680us             1  
-                                       aten::empty_like         1.68%       7.980us         6.13%      29.160us       9.720us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.45%      21.180us         4.45%      21.180us       7.060us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.97%     156.943us        32.97%     156.943us      52.314us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.28%       6.090us         1.28%       6.090us       6.090us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.652us      1120.72%     122.652us     122.652us             1  
+                               hf_kernels_causal_conv1d        12.91%      86.303us        99.27%     663.588us     663.588us       0.000us         0.00%      14.624us      14.624us             1  
+                                         CausalConv1dFn        10.74%      71.821us        86.36%     577.285us     192.428us       0.000us         0.00%      14.624us       4.875us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.81%      25.480us        71.21%     476.023us     158.674us      10.944us       100.00%      14.624us       4.875us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.944us       100.00%      10.944us       3.648us             3  
+                                Activity Buffer Request        32.82%     219.426us        32.82%     219.426us     219.426us       3.680us        33.63%       3.680us       3.680us             1  
+                                       aten::empty_like         1.14%       7.591us         4.40%      29.441us       9.814us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.27%      21.850us         3.27%      21.850us       7.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.57%     231.117us        34.57%     231.117us      77.039us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.73%       4.900us         0.73%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 476.018us
-Self CUDA time total: 10.912us
+Self CPU time total: 668.488us
+Self CUDA time total: 10.944us
 
 
 
@@ -4384,19 +4386,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     129.375us      1178.71%     129.375us     129.375us             1  
-                               hf_kernels_causal_conv1d         5.38%      99.351us        99.70%       1.840ms       1.840ms       0.000us         0.00%      14.656us      14.656us             1  
-                                         CausalConv1dFn         4.01%      73.942us        94.32%       1.740ms     580.087us       0.000us         0.00%      14.656us       4.885us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.38%      25.552us        88.67%       1.636ms     545.346us      10.976us       100.00%      14.656us       4.885us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.976us       100.00%      10.976us       3.659us             3  
-                                Activity Buffer Request        78.64%       1.451ms        78.64%       1.451ms       1.451ms       3.680us        33.53%       3.680us       3.680us             1  
-                                       aten::empty_like         0.48%       8.800us         1.64%      30.280us      10.093us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.16%      21.480us         1.16%      21.480us       7.160us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.64%     159.392us         8.64%     159.392us      53.131us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.30%       5.531us         0.30%       5.531us       5.531us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.430us      1181.43%     130.430us     130.430us             1  
+                               hf_kernels_causal_conv1d         4.23%      79.341us        99.73%       1.871ms       1.871ms       0.000us         0.00%      14.784us      14.784us             1  
+                                         CausalConv1dFn         4.03%      75.521us        95.50%       1.792ms     597.206us       0.000us         0.00%      14.784us       4.928us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.43%      26.810us        89.82%       1.685ms     561.675us      11.040us       100.00%      14.784us       4.928us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.040us       100.00%      11.040us       3.680us             3  
+                                Activity Buffer Request        77.07%       1.446ms        77.07%       1.446ms       1.446ms       3.744us        33.91%       3.744us       3.744us             1  
+                                       aten::empty_like         0.44%       8.272us         1.66%      31.072us      10.357us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.22%      22.800us         1.22%      22.800us       7.600us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        11.32%     212.286us        11.32%     212.286us      70.762us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.130us         0.27%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.845ms
-Self CUDA time total: 10.976us
+Self CPU time total: 1.876ms
+Self CUDA time total: 11.040us
 
 
 
@@ -4406,19 +4408,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.679us      1104.47%     123.679us     123.679us             1  
-                               hf_kernels_causal_conv1d        17.75%      87.860us        98.92%     489.618us     489.618us       0.000us         0.00%      14.974us      14.974us             1  
-                                         CausalConv1dFn        14.77%      73.091us        81.17%     401.758us     133.919us       0.000us         0.00%      14.974us       4.991us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.42%      26.830us        60.45%     299.195us      99.732us      11.198us       100.00%      14.974us       4.991us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.198us       100.00%      11.198us       3.733us             3  
-                                Activity Buffer Request        20.28%     100.392us        20.28%     100.392us     100.392us       3.776us        33.72%       3.776us       3.776us             1  
-                                       aten::empty_like         1.69%       8.381us         5.95%      29.472us       9.824us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.26%      21.091us         4.26%      21.091us       7.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.75%     171.973us        34.75%     171.973us      57.324us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.08%       5.331us         1.08%       5.331us       5.331us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.097us      1060.18%     120.097us     120.097us             1  
+                               hf_kernels_causal_conv1d        13.35%      76.301us        99.17%     566.674us     566.674us       0.000us         0.00%      15.168us      15.168us             1  
+                                         CausalConv1dFn        12.80%      73.153us        85.81%     490.373us     163.458us       0.000us         0.00%      15.168us       5.056us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.71%      26.911us        68.00%     388.569us     129.523us      11.328us       100.00%      15.168us       5.056us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.328us       100.00%      11.328us       3.776us             3  
+                                Activity Buffer Request        34.49%     197.075us        34.49%     197.075us     197.075us       3.840us        33.90%       3.840us       3.840us             1  
+                                       aten::empty_like         1.29%       7.379us         5.01%      28.651us       9.550us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.72%      21.272us         3.72%      21.272us       7.091us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        28.80%     164.583us        28.80%     164.583us      54.861us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       4.760us         0.83%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 494.949us
-Self CUDA time total: 11.198us
+Self CPU time total: 571.434us
+Self CUDA time total: 11.328us
 
 
 
@@ -4428,19 +4430,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     132.959us       264.31%     132.959us     132.959us             1  
-                               hf_kernels_causal_conv1d         5.33%      97.801us        99.71%       1.830ms       1.830ms       0.000us         0.00%      83.968us      83.968us             1  
-                                         CausalConv1dFn         4.03%      73.903us        94.38%       1.732ms     577.264us       0.000us         0.00%      83.968us      27.989us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.44%      26.339us        88.71%       1.628ms     542.606us      50.304us       100.00%      83.968us      27.989us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.304us       100.00%      50.304us      16.768us             3  
-                                Activity Buffer Request        78.52%       1.441ms        78.52%       1.441ms       1.441ms      33.664us        66.92%      33.664us      33.664us             1  
-                                       aten::empty_like         0.46%       8.510us         1.64%      30.070us      10.023us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.17%      21.560us         1.17%      21.560us       7.187us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.75%     160.594us         8.75%     160.594us      53.531us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.29%       5.400us         0.29%       5.400us       5.400us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.919us       265.71%     133.919us     133.919us             1  
+                               hf_kernels_causal_conv1d         4.38%      80.552us        99.73%       1.836ms       1.836ms       0.000us         0.00%      83.873us      83.873us             1  
+                                         CausalConv1dFn         4.09%      75.353us        95.35%       1.755ms     585.145us       0.000us         0.00%      83.873us      27.958us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.33%      24.410us        89.50%       1.648ms     549.264us      50.401us       100.00%      83.873us      27.958us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.401us       100.00%      50.401us      16.800us             3  
+                                Activity Buffer Request        79.01%       1.455ms        79.01%       1.455ms       1.455ms      33.472us        66.41%      33.472us      33.472us             1  
+                                       aten::empty_like         0.45%       8.369us         1.75%      32.290us      10.763us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.30%      23.921us         1.30%      23.921us       7.974us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.17%     168.764us         9.17%     168.764us      56.255us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       5.020us         0.27%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.835ms
-Self CUDA time total: 50.304us
+Self CPU time total: 1.841ms
+Self CUDA time total: 50.401us
 
 
 
@@ -4450,18 +4452,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.085us       244.46%     125.085us     125.085us             1  
-                               hf_kernels_causal_conv1d        15.91%      74.080us        98.78%     459.898us     459.898us       0.000us         0.00%      85.694us      85.694us             1  
-                                         CausalConv1dFn        15.58%      72.521us        82.87%     385.818us     128.606us       0.000us         0.00%      85.694us      28.565us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.92%      27.572us        61.05%     284.236us      94.745us      51.167us       100.00%      85.694us      28.565us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     131.005us       256.03%     131.005us     131.005us             1  
+                               hf_kernels_causal_conv1d        11.69%      77.241us        99.25%     655.717us     655.717us       0.000us         0.00%      85.534us      85.534us             1  
+                                         CausalConv1dFn        10.97%      72.503us        87.56%     578.476us     192.825us       0.000us         0.00%      85.534us      28.511us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.89%      25.692us        71.76%     474.103us     158.034us      51.167us       100.00%      85.534us      28.511us             3  
 void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.167us       100.00%      51.167us      17.056us             3  
-                                Activity Buffer Request        21.78%     101.412us        21.78%     101.412us     101.412us      34.527us        67.48%      34.527us      34.527us             1  
-                                       aten::empty_like         1.68%       7.830us         6.24%      29.061us       9.687us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.56%      21.231us         4.56%      21.231us       7.077us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.35%     155.252us        33.35%     155.252us      51.751us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.22%       5.680us         1.22%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        43.08%     284.587us        43.08%     284.587us     284.587us      34.367us        67.17%      34.367us      34.367us             1  
+                                       aten::empty_like         1.14%       7.549us         4.82%      31.870us      10.623us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.68%      24.321us         3.68%      24.321us       8.107us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.80%     163.824us        24.80%     163.824us      54.608us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.75%       4.929us         0.75%       4.929us       4.929us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 465.578us
+Self CPU time total: 660.646us
 Self CUDA time total: 51.167us
 
 
@@ -4472,19 +4474,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.583us      3164.74%     123.583us     123.583us             1  
-                               hf_kernels_causal_conv1d         8.70%      75.560us        99.36%     863.215us     863.215us       0.000us         0.00%       5.153us       5.153us             1  
-                                         CausalConv1dFn         8.33%      72.353us        90.66%     787.655us     262.552us       0.000us         0.00%       5.153us       1.718us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.88%      25.000us        78.85%     685.062us     228.354us       3.905us       100.00%       5.153us       1.718us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.905us       100.00%       3.905us       1.302us             3  
-                                Activity Buffer Request        57.61%     500.499us        57.61%     500.499us     500.499us       1.248us        31.96%       1.248us       1.248us             1  
-                                       aten::empty_like         0.96%       8.370us         3.48%      30.240us      10.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.52%      21.870us         2.52%      21.870us       7.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        18.37%     159.563us        18.37%     159.563us      53.188us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.64%       5.560us         0.64%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.686us      3040.89%     118.686us     118.686us             1  
+                               hf_kernels_causal_conv1d        11.60%      73.750us        99.24%     631.216us     631.216us       0.000us         0.00%       5.183us       5.183us             1  
+                                         CausalConv1dFn        11.30%      71.845us        87.65%     557.466us     185.822us       0.000us         0.00%       5.183us       1.728us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.22%      26.861us        71.87%     457.101us     152.367us       3.903us       100.00%       5.183us       1.728us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.903us       100.00%       3.903us       1.301us             3  
+                                Activity Buffer Request        42.38%     269.577us        42.38%     269.577us     269.577us       1.280us        32.80%       1.280us       1.280us             1  
+                                       aten::empty_like         1.23%       7.810us         4.48%      28.520us       9.507us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.26%      20.710us         3.26%      20.710us       6.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.26%     160.663us        25.26%     160.663us      53.554us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.76%       4.821us         0.76%       4.821us       4.821us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 868.775us
-Self CUDA time total: 3.905us
+Self CPU time total: 636.037us
+Self CUDA time total: 3.903us
 
 
 
@@ -4494,19 +4496,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.845us      3044.19%     118.845us     118.845us             1  
-                               hf_kernels_causal_conv1d        16.55%      74.260us        98.76%     443.077us     443.077us       0.000us         0.00%       5.152us       5.152us             1  
-                                         CausalConv1dFn        15.87%      71.182us        82.21%     368.817us     122.939us       0.000us         0.00%       5.152us       1.717us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.48%      24.591us        59.34%     266.204us      88.735us       3.904us       100.00%       5.152us       1.717us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.904us       100.00%       3.904us       1.301us             3  
-                                Activity Buffer Request        18.72%      83.961us        18.72%      83.961us      83.961us       1.248us        31.97%       1.248us       1.248us             1  
-                                       aten::empty_like         1.83%       8.189us         7.01%      31.431us      10.477us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         5.18%      23.242us         5.18%      23.242us       7.747us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.14%     157.652us        35.14%     157.652us      52.551us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.24%       5.551us         1.24%       5.551us       5.551us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.221us      3029.76%     120.221us     120.221us             1  
+                               hf_kernels_causal_conv1d        13.01%      75.082us        99.09%     571.775us     571.775us       0.000us         0.00%       5.248us       5.248us             1  
+                                         CausalConv1dFn        12.35%      71.241us        86.08%     496.693us     165.564us       0.000us         0.00%       5.248us       1.749us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.88%      28.181us        68.58%     395.720us     131.907us       3.968us       100.00%       5.248us       1.749us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
+                                Activity Buffer Request        36.26%     209.246us        36.26%     209.246us     209.246us       1.280us        32.26%       1.280us       1.280us             1  
+                                       aten::empty_like         1.42%       8.172us         5.15%      29.732us       9.911us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.74%      21.560us         3.74%      21.560us       7.187us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.43%     158.293us        27.43%     158.293us      52.764us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.91%       5.270us         0.91%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 448.628us
-Self CUDA time total: 3.904us
+Self CPU time total: 577.045us
+Self CUDA time total: 3.968us
 
 
 
@@ -4516,19 +4518,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.816us      3046.03%     122.816us     122.816us             1  
-                               hf_kernels_causal_conv1d         8.66%      75.390us        99.38%     865.505us     865.505us       0.000us         0.00%       5.376us       5.376us             1  
-                                         CausalConv1dFn         8.40%      73.201us        90.72%     790.115us     263.372us       0.000us         0.00%       5.376us       1.792us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.02%      26.261us        78.90%     687.193us     229.064us       4.032us       100.00%       5.376us       1.792us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
-                                Activity Buffer Request        57.07%     497.089us        57.07%     497.089us     497.089us       1.344us        33.33%       1.344us       1.344us             1  
-                                       aten::empty_like         0.93%       8.130us         3.41%      29.721us       9.907us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.48%      21.591us         2.48%      21.591us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        18.81%     163.843us        18.81%     163.843us      54.614us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.62%       5.440us         0.62%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.374us      2843.36%     117.374us     117.374us             1  
+                               hf_kernels_causal_conv1d        14.38%      74.792us        98.97%     514.843us     514.843us       0.000us         0.00%       5.504us       5.504us             1  
+                                         CausalConv1dFn        13.25%      68.940us        84.59%     440.051us     146.684us       0.000us         0.00%       5.504us       1.835us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.99%      25.981us        65.51%     340.779us     113.593us       4.128us       100.00%       5.504us       1.835us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.128us       100.00%       4.128us       1.376us             3  
+                                Activity Buffer Request        29.84%     155.214us        29.84%     155.214us     155.214us       1.376us        33.33%       1.376us       1.376us             1  
+                                       aten::empty_like         1.55%       8.080us         5.83%      30.332us      10.111us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.28%      22.252us         4.28%      22.252us       7.417us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.68%     159.584us        30.68%     159.584us      53.195us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.03%       5.380us         1.03%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 870.945us
-Self CUDA time total: 4.032us
+Self CPU time total: 520.223us
+Self CUDA time total: 4.128us
 
 
 
@@ -4538,18 +4540,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.446us      2866.01%     116.446us     116.446us             1  
-                               hf_kernels_causal_conv1d        16.24%      74.671us        98.84%     454.378us     454.378us       0.000us         0.00%       5.407us       5.407us             1  
-                                         CausalConv1dFn        15.28%      70.221us        82.60%     379.707us     126.569us       0.000us         0.00%       5.407us       1.802us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.99%      27.540us        61.00%     280.405us      93.468us       4.063us       100.00%       5.407us       1.802us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     116.831us      2875.49%     116.831us     116.831us             1  
+                               hf_kernels_causal_conv1d        13.78%      75.282us        99.09%     541.484us     541.484us       0.000us         0.00%       5.439us       5.439us             1  
+                                         CausalConv1dFn        12.58%      68.741us        85.32%     466.202us     155.401us       0.000us         0.00%       5.439us       1.813us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.76%      26.021us        67.34%     367.980us     122.660us       4.063us       100.00%       5.439us       1.813us             3  
 void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.063us       100.00%       4.063us       1.354us             3  
-                                Activity Buffer Request        21.14%      97.192us        21.14%      97.192us      97.192us       1.344us        33.08%       1.344us       1.344us             1  
-                                       aten::empty_like         1.73%       7.931us         6.33%      29.081us       9.694us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.60%      21.150us         4.60%      21.150us       7.050us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        33.86%     155.673us        33.86%     155.673us      51.891us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.16%       5.330us         1.16%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        33.52%     183.175us        33.52%     183.175us     183.175us       1.376us        33.87%       1.376us       1.376us             1  
+                                       aten::empty_like         1.37%       7.489us         5.40%      29.481us       9.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.02%      21.992us         4.02%      21.992us       7.331us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        29.06%     158.784us        29.06%     158.784us      52.928us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.91%       4.951us         0.91%       4.951us       4.951us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 459.708us
+Self CPU time total: 546.435us
 Self CUDA time total: 4.063us
 
 
@@ -4560,19 +4562,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.895us      2262.26%     120.895us     120.895us             1  
-                               hf_kernels_causal_conv1d        10.03%      75.040us        99.26%     742.432us     742.432us       0.000us         0.00%       7.136us       7.136us             1  
-                                         CausalConv1dFn         9.57%      71.601us        89.23%     667.392us     222.464us       0.000us         0.00%       7.136us       2.379us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.57%      26.722us        75.60%     565.480us     188.493us       5.344us       100.00%       7.136us       2.379us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.344us       100.00%       5.344us       1.781us             3  
-                                Activity Buffer Request        50.95%     381.056us        50.95%     381.056us     381.056us       1.792us        33.53%       1.792us       1.792us             1  
-                                       aten::empty_like         1.09%       8.161us         4.05%      30.311us      10.104us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.96%      22.150us         2.96%      22.150us       7.383us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        21.08%     157.702us        21.08%     157.702us      52.567us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.74%       5.510us         0.74%       5.510us       5.510us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.806us      2228.53%     119.806us     119.806us             1  
+                               hf_kernels_causal_conv1d        11.93%      76.073us        99.21%     632.507us     632.507us       0.000us         0.00%       7.200us       7.200us             1  
+                                         CausalConv1dFn        11.21%      71.480us        87.28%     556.434us     185.478us       0.000us         0.00%       7.200us       2.400us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.13%      26.361us        71.46%     455.612us     151.871us       5.376us       100.00%       7.200us       2.400us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.376us       100.00%       5.376us       1.792us             3  
+                                Activity Buffer Request        42.49%     270.867us        42.49%     270.867us     270.867us       1.824us        33.93%       1.824us       1.824us             1  
+                                       aten::empty_like         1.24%       7.892us         4.60%      29.342us       9.781us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.36%      21.450us         3.36%      21.450us       7.150us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.84%     158.384us        24.84%     158.384us      52.795us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.79%       5.050us         0.79%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 747.942us
-Self CUDA time total: 5.344us
+Self CPU time total: 637.557us
+Self CUDA time total: 5.376us
 
 
 
@@ -4582,19 +4584,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     114.428us      2091.54%     114.428us     114.428us             1  
-                               hf_kernels_causal_conv1d        15.93%      72.612us        98.81%     450.477us     450.477us       0.000us         0.00%       7.327us       7.327us             1  
-                                         CausalConv1dFn        15.28%      69.671us        82.88%     377.865us     125.955us       0.000us         0.00%       7.327us       2.442us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.81%      26.480us        61.42%     279.994us      93.331us       5.471us       100.00%       7.327us       2.442us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.471us       100.00%       5.471us       1.824us             3  
-                                Activity Buffer Request        21.45%      97.772us        21.45%      97.772us      97.772us       1.856us        33.92%       1.856us       1.856us             1  
-                                       aten::empty_like         1.75%       7.980us         6.19%      28.200us       9.400us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.44%      20.220us         4.44%      20.220us       6.740us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        34.16%     155.742us        34.16%     155.742us      51.914us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.19%       5.420us         1.19%       5.420us       5.420us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     119.676us      2174.35%     119.676us     119.676us             1  
+                               hf_kernels_causal_conv1d        14.25%      74.352us        99.01%     516.513us     516.513us       0.000us         0.00%       7.392us       7.392us             1  
+                                         CausalConv1dFn        14.02%      73.122us        84.76%     442.161us     147.387us       0.000us         0.00%       7.392us       2.464us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.04%      26.281us        65.18%     340.038us     113.346us       5.504us       100.00%       7.392us       2.464us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.504us       100.00%       5.504us       1.835us             3  
+                                Activity Buffer Request        30.19%     157.524us        30.19%     157.524us     157.524us       1.888us        34.30%       1.888us       1.888us             1  
+                                       aten::empty_like         1.50%       7.800us         5.56%      29.001us       9.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.06%      21.201us         4.06%      21.201us       7.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        29.95%     156.233us        29.95%     156.233us      52.078us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.99%       5.180us         0.99%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 455.897us
-Self CUDA time total: 5.471us
+Self CPU time total: 521.693us
+Self CUDA time total: 5.504us
 
 
 
@@ -4604,19 +4606,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.251us       717.80%     124.251us     124.251us             1  
-                               hf_kernels_causal_conv1d        10.05%      75.520us        99.24%     745.563us     745.563us       0.000us         0.00%      23.101us      23.101us             1  
-                                         CausalConv1dFn         9.33%      70.111us        89.19%     670.043us     223.348us       0.000us         0.00%      23.101us       7.700us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.43%      25.770us        75.92%     570.342us     190.114us      17.310us       100.00%      23.101us       7.700us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.310us       100.00%      17.310us       5.770us             3  
-                                Activity Buffer Request        51.18%     384.497us        51.18%     384.497us     384.497us       5.791us        33.45%       5.791us       5.791us             1  
-                                       aten::empty_like         1.14%       8.540us         3.94%      29.590us       9.863us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.80%      21.050us         2.80%      21.050us       7.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        21.31%     160.075us        21.31%     160.075us      53.358us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.76%       5.680us         0.76%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.798us       715.63%     124.798us     124.798us             1  
+                               hf_kernels_causal_conv1d        11.85%      75.293us        99.15%     630.167us     630.167us       0.000us         0.00%      23.295us      23.295us             1  
+                                         CausalConv1dFn        11.06%      70.310us        87.30%     554.874us     184.958us       0.000us         0.00%      23.295us       7.765us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.18%      26.540us        71.39%     453.732us     151.244us      17.439us       100.00%      23.295us       7.765us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.439us       100.00%      17.439us       5.813us             3  
+                                Activity Buffer Request        42.20%     268.237us        42.20%     268.237us     268.237us       5.856us        33.58%       5.856us       5.856us             1  
+                                       aten::empty_like         1.25%       7.951us         4.85%      30.832us      10.277us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.60%      22.881us         3.60%      22.881us       7.627us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.01%     158.955us        25.01%     158.955us      52.985us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       5.410us         0.85%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 751.243us
-Self CUDA time total: 17.310us
+Self CPU time total: 635.577us
+Self CUDA time total: 17.439us
 
 
 
@@ -4626,19 +4628,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.596us       682.20%     121.596us     121.596us             1  
-                               hf_kernels_causal_conv1d        16.81%      75.551us        98.76%     443.797us     443.797us       0.000us         0.00%      23.808us      23.808us             1  
-                                         CausalConv1dFn        15.22%      68.400us        81.95%     368.246us     122.749us       0.000us         0.00%      23.808us       7.936us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.83%      26.181us        60.07%     269.934us      89.978us      17.824us       100.00%      23.808us       7.936us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.824us       100.00%      17.824us       5.941us             3  
-                                Activity Buffer Request        19.24%      86.441us        19.24%      86.441us      86.441us       5.984us        33.57%       5.984us       5.984us             1  
-                                       aten::empty_like         1.76%       7.900us         6.66%      29.912us       9.971us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.90%      22.012us         4.90%      22.012us       7.337us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        35.01%     157.312us        35.01%     157.312us      52.437us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.24%       5.550us         1.24%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.252us       695.89%     124.252us     124.252us             1  
+                               hf_kernels_causal_conv1d        15.28%      76.213us        99.04%     494.053us     494.053us       0.000us         0.00%      23.839us      23.839us             1  
+                                         CausalConv1dFn        14.60%      72.841us        83.76%     417.840us     139.280us       0.000us         0.00%      23.839us       7.946us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.38%      26.851us        63.27%     315.607us     105.202us      17.855us       100.00%      23.839us       7.946us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.855us       100.00%      17.855us       5.952us             3  
+                                Activity Buffer Request        26.40%     131.703us        26.40%     131.703us     131.703us       5.984us        33.51%       5.984us       5.984us             1  
+                                       aten::empty_like         1.62%       8.090us         5.89%      29.392us       9.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.27%      21.302us         4.27%      21.302us       7.101us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.48%     157.053us        31.48%     157.053us      52.351us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.96%       4.810us         0.96%       4.810us       4.810us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 449.347us
-Self CUDA time total: 17.824us
+Self CPU time total: 498.863us
+Self CUDA time total: 17.855us
 
 
 
@@ -4648,19 +4650,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.077us       686.13%     122.077us     122.077us             1  
-                               hf_kernels_causal_conv1d        12.00%      91.181us        99.29%     754.243us     754.243us       0.000us         0.00%      23.808us      23.808us             1  
-                                         CausalConv1dFn         9.45%      71.802us        87.29%     663.062us     221.021us       0.000us         0.00%      23.808us       7.936us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.27%      24.831us        73.88%     561.180us     187.060us      17.792us       100.00%      23.808us       7.936us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.792us       100.00%      17.792us       5.931us             3  
-                                Activity Buffer Request        49.89%     378.947us        49.89%     378.947us     378.947us       6.016us        33.81%       6.016us       6.016us             1  
-                                       aten::empty_like         1.06%       8.020us         3.96%      30.080us      10.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         2.90%      22.060us         2.90%      22.060us       7.353us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        20.72%     157.402us        20.72%     157.402us      52.467us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.71%       5.381us         0.71%       5.381us       5.381us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.253us       695.94%     124.253us     124.253us             1  
+                               hf_kernels_causal_conv1d        14.09%      92.581us        99.22%     652.096us     652.096us       0.000us         0.00%      23.838us      23.838us             1  
+                                         CausalConv1dFn        11.45%      75.254us        85.13%     559.515us     186.505us       0.000us         0.00%      23.838us       7.946us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.84%      25.251us        69.30%     455.481us     151.827us      17.854us       100.00%      23.838us       7.946us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.854us       100.00%      17.854us       5.951us             3  
+                                Activity Buffer Request        41.42%     272.247us        41.42%     272.247us     272.247us       5.984us        33.52%       5.984us       5.984us             1  
+                                       aten::empty_like         1.19%       7.849us         4.38%      28.780us       9.593us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.18%      20.931us         3.18%      20.931us       6.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.04%     157.983us        24.04%     157.983us      52.661us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.78%       5.140us         0.78%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 759.624us
-Self CUDA time total: 17.792us
+Self CPU time total: 657.236us
+Self CUDA time total: 17.854us
 
 
 
@@ -4670,19 +4672,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     124.351us       671.15%     124.351us     124.351us             1  
-                               hf_kernels_causal_conv1d        19.13%      92.321us        98.80%     476.748us     476.748us       0.000us         0.00%      24.736us      24.736us             1  
-                                         CausalConv1dFn        14.83%      71.551us        79.67%     384.427us     128.142us       0.000us         0.00%      24.736us       8.245us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.89%      28.409us        58.58%     282.676us      94.225us      18.528us       100.00%      24.736us       8.245us             3  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.528us       100.00%      18.528us       6.176us             3  
-                                Activity Buffer Request        20.26%      97.782us        20.26%      97.782us      97.782us       6.208us        33.51%       6.208us       6.208us             1  
-                                       aten::empty_like         1.73%       8.360us         6.26%      30.200us      10.067us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.53%      21.840us         4.53%      21.840us       7.280us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.43%     156.485us        32.43%     156.485us      52.162us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.20%       5.770us         1.20%       5.770us       5.770us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     121.982us       651.61%     121.982us     121.982us             1  
+                               hf_kernels_causal_conv1d        16.26%      76.273us        99.00%     464.343us     464.343us       0.000us         0.00%      25.088us      25.088us             1  
+                                         CausalConv1dFn        15.20%      71.302us        82.74%     388.070us     129.357us       0.000us         0.00%      25.088us       8.363us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.49%      25.750us        61.15%     286.808us      95.603us      18.720us       100.00%      25.088us       8.363us             3  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.720us       100.00%      18.720us       6.240us             3  
+                                Activity Buffer Request        22.13%     103.813us        22.13%     103.813us     103.813us       6.368us        34.02%       6.368us       6.368us             1  
+                                       aten::empty_like         1.75%       8.210us         6.39%      29.960us       9.987us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.64%      21.750us         4.64%      21.750us       7.250us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.53%     157.245us        33.53%     157.245us      52.415us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.00%       4.680us         1.00%       4.680us       4.680us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 482.518us
-Self CUDA time total: 18.528us
+Self CPU time total: 469.023us
+Self CUDA time total: 18.720us
 
 
 
@@ -4692,19 +4694,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d         5.47%     101.271us        99.69%       1.845ms       1.845ms       0.000us         0.00%     162.913us     162.913us             1  
-                                         CausalConv1dFn         4.05%      75.021us        94.22%       1.743ms     581.104us       0.000us         0.00%     162.913us      54.304us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.32%      24.372us        88.46%       1.637ms     545.603us      97.697us       100.00%     162.913us      54.304us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     139.807us       143.10%     139.807us     139.807us             1  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.697us       100.00%      97.697us      32.566us             3  
-                                Activity Buffer Request        78.43%       1.451ms        78.43%       1.451ms       1.451ms      65.216us        66.75%      65.216us      65.216us             1  
-                                       aten::empty_like         0.45%       8.320us         1.70%      31.480us      10.493us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         1.25%      23.160us         1.25%      23.160us       7.720us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         8.71%     161.192us         8.71%     161.192us      53.731us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         0.31%       5.721us         0.31%       5.721us       5.721us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d         4.40%      80.973us        99.73%       1.837ms       1.837ms       0.000us         0.00%     162.749us     162.749us             1  
+                                         CausalConv1dFn         4.14%      76.301us        95.33%       1.756ms     585.285us       0.000us         0.00%     162.749us      54.250us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.45%      26.730us        89.50%       1.648ms     549.474us      97.918us       100.00%     162.749us      54.250us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     141.950us       144.97%     141.950us     141.950us             1  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.918us       100.00%      97.918us      32.639us             3  
+                                Activity Buffer Request        78.99%       1.455ms        78.99%       1.455ms       1.455ms      64.831us        66.21%      64.831us      64.831us             1  
+                                       aten::empty_like         0.45%       8.340us         1.69%      31.131us      10.377us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.24%      22.791us         1.24%      22.791us       7.597us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.06%     166.885us         9.06%     166.885us      55.628us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.980us         0.27%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.850ms
-Self CUDA time total: 97.697us
+Self CPU time total: 1.842ms
+Self CUDA time total: 97.918us
 
 
 
@@ -4714,19 +4716,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                               hf_kernels_causal_conv1d        19.60%      95.701us        98.90%     482.848us     482.848us       0.000us         0.00%     163.744us     163.744us             1  
-                                         CausalConv1dFn        15.21%      74.281us        79.29%     387.147us     129.049us       0.000us         0.00%     163.744us      54.581us             3  
-              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.67%      27.701us        57.93%     282.846us      94.282us      98.688us       100.00%     163.744us      54.581us             3  
-                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     139.968us       141.83%     139.968us     139.968us             1  
-void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.688us       100.00%      98.688us      32.896us             3  
-                                Activity Buffer Request        19.94%      97.362us        19.94%      97.362us      97.362us      65.056us        65.92%      65.056us      65.056us             1  
-                                       aten::empty_like         1.68%       8.190us         6.15%      30.020us      10.007us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         4.47%      21.830us         4.47%      21.830us       7.277us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel        32.32%     157.783us        32.32%     157.783us      52.594us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize         1.10%       5.391us         1.10%       5.391us       5.391us       0.000us         0.00%       0.000us       0.000us             1  
+                               hf_kernels_causal_conv1d        16.07%      76.871us        98.94%     473.172us     473.172us       0.000us         0.00%     163.803us     163.803us             1  
+                                         CausalConv1dFn        14.96%      71.532us        82.87%     396.301us     132.100us       0.000us         0.00%     163.803us      54.601us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.75%      27.501us        61.56%     294.418us      98.139us      98.685us       100.00%     163.803us      54.601us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.180us       134.95%     133.180us     133.180us             1  
+void causal_conv1d_fwd_kernel&lt;Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      98.685us       100.00%      98.685us      32.895us             3  
+                                Activity Buffer Request        21.65%     103.543us        21.65%     103.543us     103.543us      65.118us        65.99%      65.118us      65.118us             1  
+                                       aten::empty_like         1.52%       7.251us         6.35%      30.351us      10.117us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.83%      23.100us         4.83%      23.100us       7.700us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        34.16%     163.374us        34.16%     163.374us      54.458us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.06%       5.061us         1.06%       5.061us       5.061us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 488.239us
-Self CUDA time total: 98.688us
+Self CPU time total: 478.233us
+Self CUDA time total: 98.685us
 
 
 impl                     wl                  p50(ms)  ok
@@ -4758,13 +4760,13 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 15 packages in 14ms
+Installed 52 packages in 240ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
-Fetching 11 files:   9%|▉         | 1/11 [00:00&lt;00:01,  6.41it/s]
-Fetching 11 files:  64%|██████▎   | 7/11 [00:01&lt;00:00,  4.26it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:01&lt;00:00,  6.78it/s]</div>
+Fetching 11 files:   9%|▉         | 1/11 [00:00&lt;00:01,  9.42it/s]
+Fetching 11 files:  64%|██████▎   | 7/11 [00:01&lt;00:00,  4.98it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:01&lt;00:00,  7.98it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/causal_conv1d.jsonl" class="artifact" target="_blank">causal_conv1d.jsonl</a>
diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html
index afa77a96b2a763e56f05f85b5cc1eef91c17fd17..6358d2b943cf22bb9f31aeb2e669932f13397132 100644
--- a/causal_conv1d/impls/torch_causal_conv1d.html
+++ b/causal_conv1d/impls/torch_causal_conv1d.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.28s
+Cell: nv | 0.21s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/causal_conv1d/impls/torch_causal_conv1d.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/causal_conv1d/impls/torch_causal_conv1d.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">
@@ -4122,7 +4122,7 @@ Cell: nv | 0.28s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:51:43 2025       
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:25 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.28s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   27C    P8             22W /  350W |       0MiB /  46068MiB |      0%      Default |
+| N/A   33C    P0             79W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4153,13 +4153,13 @@ Cell: nv | 0.28s
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 32.46s
+Cell: benchmark | 3.68s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/causal_conv1d/impls/torch_causal_conv1d.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/causal_conv1d/impls/torch_causal_conv1d.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="40">
 <div class="code-wrap">
@@ -4217,29 +4217,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     467.230us      2421.38%     467.230us     467.230us             1  
-                                            torch_eager        10.72%     231.062us        99.69%       2.148ms       2.148ms       0.000us         0.00%      21.632us      21.632us             1  
-                                               aten::to         0.58%      12.480us        78.88%       1.700ms     283.277us       0.000us         0.00%      14.336us       2.389us             6  
-                                         aten::_to_copy         2.05%      44.092us        78.31%       1.687ms     281.197us       0.000us         0.00%      14.336us       2.389us             6  
-                                            aten::copy_         3.07%      66.050us        73.46%       1.583ms     263.783us      12.000us        62.19%      14.336us       2.389us             6  
-                                           aten::conv1d         0.49%      10.600us         7.90%     170.164us      56.721us       0.000us         0.00%       7.296us       2.432us             3  
-                                      aten::convolution         0.77%      16.490us         7.41%     159.564us      53.188us       0.000us         0.00%       7.296us       2.432us             3  
-                                     aten::_convolution         1.64%      35.301us         6.64%     143.074us      47.691us       0.000us         0.00%       7.296us       2.432us             3  
-                                aten::_conv_depthwise2d         1.69%      36.381us         4.00%      86.271us      28.757us       7.296us        37.81%       7.296us       2.432us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.296us        37.81%       7.296us       2.432us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.67%       6.304us       2.101us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        29.52%       5.696us       1.899us             3  
-                                Activity Buffer Request        66.85%       1.440ms        66.85%       1.440ms       1.440ms       2.336us        12.11%       2.336us       2.336us             1  
-                                    aten::empty_strided         2.80%      60.390us         2.80%      60.390us      10.065us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.73%     101.823us         4.73%     101.823us      11.314us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.46%      31.451us         1.84%      39.731us       4.415us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.62%      13.289us         0.62%      13.289us       0.886us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.58%      12.560us         0.58%      12.560us       4.187us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.54%      11.740us         0.54%      11.740us       3.913us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.42%       8.963us         0.49%      10.602us       3.534us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     439.324us      2269.12%     439.324us     439.324us             1  
+                                            torch_eager        10.31%     220.478us        99.69%       2.131ms       2.131ms       0.000us         0.00%      21.729us      21.729us             1  
+                                               aten::to         0.50%      10.770us        79.87%       1.707ms     284.530us       0.000us         0.00%      14.369us       2.395us             6  
+                                         aten::_to_copy         1.71%      36.499us        79.36%       1.696ms     282.735us       0.000us         0.00%      14.369us       2.395us             6  
+                                            aten::copy_         2.77%      59.234us        75.21%       1.608ms     267.930us      12.001us        61.99%      14.369us       2.395us             6  
+                                           aten::conv1d         0.36%       7.590us         7.34%     156.883us      52.294us       0.000us         0.00%       7.360us       2.453us             3  
+                                      aten::convolution         0.66%      14.070us         6.98%     149.293us      49.764us       0.000us         0.00%       7.360us       2.453us             3  
+                                     aten::_convolution         1.51%      32.210us         6.33%     135.223us      45.074us       0.000us         0.00%       7.360us       2.453us             3  
+                                aten::_conv_depthwise2d         1.61%      34.371us         4.00%      85.463us      28.488us       7.360us        38.01%       7.360us       2.453us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        38.01%       7.360us       2.453us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.337us        32.73%       6.337us       2.112us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.25%       5.664us       1.888us             3  
+                                Activity Buffer Request        69.37%       1.483ms        69.37%       1.483ms       1.483ms       2.368us        12.23%       2.368us       2.368us             1  
+                                    aten::empty_strided         2.45%      52.331us         2.45%      52.331us       8.722us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.26%      91.032us         4.26%      91.032us      10.115us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.32%      28.311us         1.71%      36.491us       4.055us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.64%      13.700us         0.64%      13.700us       0.913us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.60%      12.790us         0.60%      12.790us       4.263us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.59%      12.710us         0.59%      12.710us       4.237us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       6.640us         0.38%       8.090us       2.697us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.155ms
-Self CUDA time total: 19.296us
+Self CPU time total: 2.138ms
+Self CUDA time total: 19.361us
 
 
 
@@ -4249,29 +4249,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     337.566us      1726.42%     337.566us     337.566us             1  
-                                            torch_eager         6.86%     130.161us        99.69%       1.893ms       1.893ms       0.000us         0.00%      21.665us      21.665us             1  
-                                               aten::to         0.32%       6.060us        85.13%       1.616ms     269.375us       0.000us         0.00%      13.729us       2.288us             6  
-                                         aten::_to_copy         1.27%      24.100us        84.81%       1.610ms     268.365us       0.000us         0.00%      13.729us       2.288us             6  
-                                            aten::copy_         2.69%      51.011us        81.95%       1.556ms     259.305us      11.617us        59.41%      13.729us       2.288us             6  
-                                           aten::conv1d         0.30%       5.740us         6.23%     118.253us      39.418us       0.000us         0.00%       7.936us       2.645us             3  
-                                      aten::convolution         0.52%       9.902us         5.93%     112.513us      37.504us       0.000us         0.00%       7.936us       2.645us             3  
-                                     aten::_convolution         1.21%      22.959us         5.40%     102.611us      34.204us       0.000us         0.00%       7.936us       2.645us             3  
-                                aten::_conv_depthwise2d         1.18%      22.461us         3.33%      63.161us      21.054us       7.936us        40.59%       7.936us       2.645us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.59%       7.936us       2.645us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        31.09%       6.080us       2.027us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.537us        28.32%       5.537us       1.846us             3  
-                                Activity Buffer Request        76.56%       1.454ms        76.56%       1.454ms       1.454ms       2.112us        10.80%       2.112us       2.112us             1  
-                                    aten::empty_strided         1.59%      30.260us         1.59%      30.260us       5.043us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.84%      72.993us         3.84%      72.993us       8.110us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.96%      18.220us         1.27%      24.051us       2.672us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.50%       9.451us         0.50%       9.451us       0.630us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.52%       9.960us         0.52%       9.960us       3.320us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.48%       9.030us         0.48%       9.030us       3.010us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       5.890us         0.39%       7.340us       2.447us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.789us      1742.49%     341.789us     341.789us             1  
+                                            torch_eager         7.86%     151.082us        99.71%       1.916ms       1.916ms       0.000us         0.00%      21.695us      21.695us             1  
+                                               aten::to         0.35%       6.661us        83.96%       1.614ms     268.966us       0.000us         0.00%      13.695us       2.282us             6  
+                                         aten::_to_copy         1.29%      24.781us        83.61%       1.607ms     267.856us       0.000us         0.00%      13.695us       2.282us             6  
+                                            aten::copy_         2.59%      49.784us        80.72%       1.552ms     258.589us      11.615us        59.21%      13.695us       2.282us             6  
+                                           aten::conv1d         0.32%       6.220us         6.35%     122.113us      40.704us       0.000us         0.00%       8.000us       2.667us             3  
+                                      aten::convolution         0.53%      10.120us         6.03%     115.893us      38.631us       0.000us         0.00%       8.000us       2.667us             3  
+                                     aten::_convolution         1.20%      23.080us         5.50%     105.773us      35.258us       0.000us         0.00%       8.000us       2.667us             3  
+                                aten::_conv_depthwise2d         1.19%      22.952us         3.39%      65.123us      21.708us       8.000us        40.79%       8.000us       2.667us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.000us        40.79%       8.000us       2.667us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.047us        30.83%       6.047us       2.016us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        28.39%       5.568us       1.856us             3  
+                                Activity Buffer Request        75.54%       1.452ms        75.54%       1.452ms       1.452ms       2.080us        10.60%       2.080us       2.080us             1  
+                                    aten::empty_strided         1.60%      30.820us         1.60%      30.820us       5.137us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.74%      71.953us         3.74%      71.953us       7.995us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.98%      18.881us         1.29%      24.750us       2.750us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.50%       9.609us         0.50%       9.609us       0.641us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.56%      10.750us         0.56%      10.750us       3.583us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.49%       9.339us         0.49%       9.339us       3.113us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       6.630us         0.42%       8.000us       2.667us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.899ms
-Self CUDA time total: 19.553us
+Self CPU time total: 1.922ms
+Self CUDA time total: 19.615us
 
 
 
@@ -4281,29 +4281,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     333.726us      1795.19%     333.726us     333.726us             1  
-                                            torch_eager         6.76%     126.472us        99.71%       1.865ms       1.865ms       0.000us         0.00%      20.510us      20.510us             1  
-                                               aten::to         0.32%       5.970us        85.12%       1.592ms     265.378us       0.000us         0.00%      13.598us       2.266us             6  
-                                         aten::_to_copy         1.26%      23.561us        84.80%       1.586ms     264.383us       0.000us         0.00%      13.598us       2.266us             6  
-                                            aten::copy_         2.75%      51.371us        81.92%       1.532ms     255.399us      11.678us        62.82%      13.598us       2.266us             6  
-                                           aten::conv1d         0.31%       5.850us         6.37%     119.083us      39.694us       0.000us         0.00%       6.912us       2.304us             3  
-                                      aten::convolution         0.54%      10.170us         6.05%     113.233us      37.744us       0.000us         0.00%       6.912us       2.304us             3  
-                                     aten::_convolution         1.25%      23.320us         5.51%     103.063us      34.354us       0.000us         0.00%       6.912us       2.304us             3  
-                                aten::_conv_depthwise2d         1.20%      22.402us         3.41%      63.713us      21.238us       6.912us        37.18%       6.912us       2.304us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.912us        37.18%       6.912us       2.304us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.951us        32.01%       5.951us       1.984us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.727us        30.81%       5.727us       1.909us             3  
-                                Activity Buffer Request        76.63%       1.433ms        76.63%       1.433ms       1.433ms       1.920us        10.33%       1.920us       1.920us             1  
-                                    aten::empty_strided         1.62%      30.340us         1.62%      30.340us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.76%      70.302us         3.76%      70.302us       7.811us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.94%      17.590us         1.23%      22.950us       2.550us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.48%       8.970us         0.48%       8.970us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.54%      10.051us         0.54%      10.051us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       8.519us         0.46%       8.519us       2.840us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       5.980us         0.39%       7.380us       2.460us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     343.328us      1837.45%     343.328us     343.328us             1  
+                                            torch_eager         7.88%     151.015us        99.69%       1.911ms       1.911ms       0.000us         0.00%      20.605us      20.605us             1  
+                                               aten::to         0.33%       6.409us        84.02%       1.611ms     268.468us       0.000us         0.00%      13.662us       2.277us             6  
+                                         aten::_to_copy         1.32%      25.354us        83.68%       1.604ms     267.400us       0.000us         0.00%      13.662us       2.277us             6  
+                                            aten::copy_         2.65%      50.770us        80.80%       1.549ms     258.170us      11.742us        62.84%      13.662us       2.277us             6  
+                                           aten::conv1d         0.33%       6.290us         6.34%     121.483us      40.494us       0.000us         0.00%       6.943us       2.314us             3  
+                                      aten::convolution         0.54%      10.430us         6.01%     115.193us      38.398us       0.000us         0.00%       6.943us       2.314us             3  
+                                     aten::_convolution         1.17%      22.439us         5.46%     104.763us      34.921us       0.000us         0.00%       6.943us       2.314us             3  
+                                aten::_conv_depthwise2d         1.17%      22.412us         3.43%      65.843us      21.948us       6.943us        37.16%       6.943us       2.314us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.943us        37.16%       6.943us       2.314us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.982us        32.01%       5.982us       1.994us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.760us        30.83%       5.760us       1.920us             3  
+                                Activity Buffer Request        75.50%       1.448ms        75.50%       1.448ms       1.448ms       1.920us        10.28%       1.920us       1.920us             1  
+                                    aten::empty_strided         1.57%      30.029us         1.57%      30.029us       5.005us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.90%      74.680us         3.90%      74.680us       8.298us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.93%      17.782us         1.21%      23.252us       2.584us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%       9.281us         0.48%       9.281us       0.619us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.57%      10.910us         0.57%      10.910us       3.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       8.531us         0.44%       8.531us       2.844us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.170us         0.39%       7.570us       2.523us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.871ms
-Self CUDA time total: 18.590us
+Self CPU time total: 1.917ms
+Self CUDA time total: 18.685us
 
 
 
@@ -4313,29 +4313,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.229us      1732.17%     339.229us     339.229us             1  
-                                            torch_eager         6.09%     126.194us        99.75%       2.066ms       2.066ms       0.000us         0.00%      21.729us      21.729us             1  
-                                               aten::to         0.29%       6.100us        86.58%       1.793ms     298.900us       0.000us         0.00%      14.018us       2.336us             6  
-                                         aten::_to_copy         1.16%      23.990us        86.28%       1.787ms     297.883us       0.000us         0.00%      14.018us       2.336us             6  
-                                            aten::copy_         2.58%      53.448us        83.67%       1.733ms     288.850us      11.873us        60.63%      14.018us       2.336us             6  
-                                           aten::conv1d         0.32%       6.580us         5.73%     118.763us      39.588us       0.000us         0.00%       7.711us       2.570us             3  
-                                      aten::convolution         0.48%       9.870us         5.42%     112.183us      37.394us       0.000us         0.00%       7.711us       2.570us             3  
-                                     aten::_convolution         1.09%      22.580us         4.94%     102.313us      34.104us       0.000us         0.00%       7.711us       2.570us             3  
-                                aten::_conv_depthwise2d         1.08%      22.411us         3.09%      64.033us      21.344us       7.711us        39.37%       7.711us       2.570us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.711us        39.37%       7.711us       2.570us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.145us        31.38%       6.145us       2.048us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.25%       5.728us       1.909us             3  
-                                Activity Buffer Request        69.66%       1.443ms        69.66%       1.443ms       1.443ms       2.145us        10.95%       2.145us       2.145us             1  
-                                    aten::empty_strided         1.46%      30.210us         1.46%      30.210us       5.035us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.49%     258.686us        12.49%     258.686us      28.743us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      18.050us         1.12%      23.200us       2.578us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.42%       8.720us         0.42%       8.720us       0.581us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%      10.140us         0.49%      10.140us       3.380us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       9.442us         0.46%       9.442us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.830us         0.35%       7.220us       2.407us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.280us      1734.88%     340.280us     340.280us             1  
+                                            torch_eager         6.89%     141.563us        99.72%       2.049ms       2.049ms       0.000us         0.00%      21.726us      21.726us             1  
+                                               aten::to         0.30%       6.132us        85.38%       1.755ms     292.424us       0.000us         0.00%      13.982us       2.330us             6  
+                                         aten::_to_copy         1.19%      24.439us        85.08%       1.748ms     291.402us       0.000us         0.00%      13.982us       2.330us             6  
+                                            aten::copy_         2.50%      51.302us        82.39%       1.693ms     282.182us      11.870us        60.52%      13.982us       2.330us             6  
+                                           aten::conv1d         0.29%       5.930us         5.97%     122.723us      40.908us       0.000us         0.00%       7.744us       2.581us             3  
+                                      aten::convolution         0.50%      10.300us         5.68%     116.793us      38.931us       0.000us         0.00%       7.744us       2.581us             3  
+                                     aten::_convolution         1.17%      23.960us         5.18%     106.493us      35.498us       0.000us         0.00%       7.744us       2.581us             3  
+                                aten::_conv_depthwise2d         1.08%      22.141us         3.19%      65.452us      21.817us       7.744us        39.48%       7.744us       2.581us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        39.48%       7.744us       2.581us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.143us        31.32%       6.143us       2.048us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.727us        29.20%       5.727us       1.909us             3  
+                                Activity Buffer Request        70.00%       1.438ms        70.00%       1.438ms       1.438ms       2.112us        10.77%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.50%      30.881us         1.50%      30.881us       5.147us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.01%     226.194us        11.01%     226.194us      25.133us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.302us         1.19%      24.432us       2.715us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.49%       9.981us         0.49%       9.981us       0.665us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      11.260us         0.55%      11.260us       3.753us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.171us         0.45%       9.171us       3.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.620us         0.39%       8.030us       2.677us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.071ms
-Self CUDA time total: 19.584us
+Self CPU time total: 2.055ms
+Self CUDA time total: 19.614us
 
 
 
@@ -4345,29 +4345,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     342.208us      1399.74%     342.208us     342.208us             1  
-                                            torch_eager         6.21%     125.160us        99.74%       2.012ms       2.012ms       0.000us         0.00%      26.720us      26.720us             1  
-                                               aten::to         0.29%       5.910us        86.35%       1.742ms     290.270us       0.000us         0.00%      15.168us       2.528us             6  
-                                         aten::_to_copy         1.25%      25.122us        86.06%       1.736ms     289.285us       0.000us         0.00%      15.168us       2.528us             6  
-                                            aten::copy_         2.93%      59.190us        83.27%       1.679ms     279.905us      12.896us        52.75%      15.168us       2.528us             6  
-                                           aten::conv1d         0.28%       5.620us         5.81%     117.132us      39.044us       0.000us         0.00%      11.552us       3.851us             3  
-                                      aten::convolution         0.49%       9.910us         5.53%     111.512us      37.171us       0.000us         0.00%      11.552us       3.851us             3  
-                                     aten::_convolution         1.15%      23.280us         5.04%     101.602us      33.867us       0.000us         0.00%      11.552us       3.851us             3  
-                                aten::_conv_depthwise2d         1.09%      21.990us         3.08%      62.201us      20.734us      11.552us        47.25%      11.552us       3.851us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.25%      11.552us       3.851us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us        27.09%       6.624us       2.208us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        25.65%       6.272us       2.091us             3  
-                                Activity Buffer Request        71.09%       1.434ms        71.09%       1.434ms       1.434ms       2.272us         9.29%       2.272us       2.272us             1  
-                                    aten::empty_strided         1.55%      31.162us         1.55%      31.162us       5.194us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.29%     207.543us        10.29%     207.543us      23.060us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.90%      18.220us         1.17%      23.681us       2.631us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.971us         0.44%       8.971us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%       9.951us         0.49%       9.951us       3.317us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       9.230us         0.46%       9.230us       3.077us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.780us         0.35%       7.150us       2.383us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     379.964us      1548.03%     379.964us     379.964us             1  
+                                            torch_eager         7.69%     160.944us        99.76%       2.089ms       2.089ms       0.000us         0.00%      26.817us      26.817us             1  
+                                               aten::to         0.33%       7.000us        83.76%       1.754ms     292.349us       0.000us         0.00%      15.265us       2.544us             6  
+                                         aten::_to_copy         1.23%      25.779us        83.43%       1.747ms     291.183us       0.000us         0.00%      15.265us       2.544us             6  
+                                            aten::copy_         2.49%      52.100us        80.65%       1.689ms     281.484us      12.993us        52.94%      15.265us       2.544us             6  
+                                           aten::conv1d         0.31%       6.410us         6.85%     143.364us      47.788us       0.000us         0.00%      11.552us       3.851us             3  
+                                      aten::convolution         1.48%      31.021us         6.54%     136.954us      45.651us       0.000us         0.00%      11.552us       3.851us             3  
+                                     aten::_convolution         1.13%      23.621us         5.06%     105.933us      35.311us       0.000us         0.00%      11.552us       3.851us             3  
+                                aten::_conv_depthwise2d         1.06%      22.209us         3.13%      65.632us      21.877us      11.552us        47.06%      11.552us       3.851us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.06%      11.552us       3.851us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.625us        26.99%       6.625us       2.208us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        25.94%       6.368us       2.123us             3  
+                                Activity Buffer Request        68.76%       1.440ms        68.76%       1.440ms       1.440ms       2.272us         9.26%       2.272us       2.272us             1  
+                                    aten::empty_strided         1.55%      32.413us         1.55%      32.413us       5.402us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.50%     219.817us        10.50%     219.817us      24.424us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      18.301us         1.15%      24.061us       2.673us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.50%      10.530us         0.50%      10.530us       0.702us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.50%      10.490us         0.50%      10.490us       3.497us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.872us         0.47%       9.872us       3.291us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.220us         0.37%       7.740us       2.580us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.017ms
-Self CUDA time total: 24.448us
+Self CPU time total: 2.094ms
+Self CUDA time total: 24.545us
 
 
 
@@ -4377,29 +4377,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     360.702us      1391.60%     360.702us     360.702us             1  
-                                            torch_eager         7.02%     142.940us        99.74%       2.030ms       2.030ms       0.000us         0.00%      28.128us      28.128us             1  
-                                               aten::to         0.30%       6.030us        85.23%       1.734ms     289.050us       0.000us         0.00%      15.136us       2.523us             6  
-                                         aten::_to_copy         1.18%      23.913us        84.93%       1.728ms     288.045us       0.000us         0.00%      15.136us       2.523us             6  
-                                            aten::copy_         2.60%      52.858us        82.24%       1.673ms     278.911us      12.928us        49.88%      15.136us       2.523us             6  
-                                           aten::conv1d         0.29%       5.931us         6.05%     123.062us      41.021us       0.000us         0.00%      12.992us       4.331us             3  
-                                      aten::convolution         0.49%      10.049us         5.76%     117.131us      39.044us       0.000us         0.00%      12.992us       4.331us             3  
-                                     aten::_convolution         1.15%      23.381us         5.26%     107.082us      35.694us       0.000us         0.00%      12.992us       4.331us             3  
-                                aten::_conv_depthwise2d         1.11%      22.652us         3.33%      67.801us      22.600us      12.992us        50.12%      12.992us       4.331us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      12.992us        50.12%      12.992us       4.331us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        25.43%       6.592us       2.197us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.44%       6.336us       2.112us             3  
-                                Activity Buffer Request        70.88%       1.442ms        70.88%       1.442ms       1.442ms       2.208us         8.52%       2.208us       2.208us             1  
-                                    aten::empty_strided         1.52%      30.891us         1.52%      30.891us       5.148us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.00%     203.394us        10.00%     203.394us      22.599us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      18.741us         1.20%      24.361us       2.707us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.330us         0.46%       9.330us       0.622us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      10.450us         0.51%      10.450us       3.483us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.490us         0.47%       9.490us       3.163us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.900us         0.36%       7.380us       2.460us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     351.133us      1341.43%     351.133us     351.133us             1  
+                                            torch_eager         7.55%     157.812us        99.73%       2.084ms       2.084ms       0.000us         0.00%      28.416us      28.416us             1  
+                                               aten::to         0.31%       6.571us        84.80%       1.772ms     295.318us       0.000us         0.00%      15.264us       2.544us             6  
+                                         aten::_to_copy         1.22%      25.450us        84.49%       1.765ms     294.223us       0.000us         0.00%      15.264us       2.544us             6  
+                                            aten::copy_         2.31%      48.301us        81.82%       1.710ms     284.947us      13.024us        49.76%      15.264us       2.544us             6  
+                                           aten::conv1d         0.32%       6.640us         5.96%     124.543us      41.514us       0.000us         0.00%      13.152us       4.384us             3  
+                                      aten::convolution         0.50%      10.360us         5.64%     117.903us      39.301us       0.000us         0.00%      13.152us       4.384us             3  
+                                     aten::_convolution         1.16%      24.330us         5.15%     107.543us      35.848us       0.000us         0.00%      13.152us       4.384us             3  
+                                aten::_conv_depthwise2d         1.06%      22.241us         3.14%      65.623us      21.874us      13.152us        50.24%      13.152us       4.384us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.152us        50.24%      13.152us       4.384us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us        25.43%       6.656us       2.219us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        24.33%       6.368us       2.123us             3  
+                                Activity Buffer Request        70.10%       1.465ms        70.10%       1.465ms       1.465ms       2.240us         8.56%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.45%      30.202us         1.45%      30.202us       5.034us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.51%     219.677us        10.51%     219.677us      24.409us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      18.881us         1.17%      24.421us       2.713us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       9.580us         0.46%       9.580us       0.639us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      11.471us         0.55%      11.471us       3.824us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.890us         0.43%       8.890us       2.963us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.33%       6.950us         0.40%       8.400us       2.800us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.035ms
-Self CUDA time total: 25.920us
+Self CPU time total: 2.089ms
+Self CUDA time total: 26.176us
 
 
 
@@ -4409,29 +4409,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     369.628us       962.57%     369.628us     369.628us             1  
-                                            torch_eager         7.12%     161.009us        99.76%       2.255ms       2.255ms       0.000us         0.00%      40.960us      40.960us             1  
-                                           aten::conv1d         0.32%       7.222us         5.82%     131.613us      43.871us       0.000us         0.00%      22.528us       7.509us             3  
-                                      aten::convolution         0.54%      12.229us         5.50%     124.391us      41.464us       0.000us         0.00%      22.528us       7.509us             3  
-                                     aten::_convolution         1.15%      26.031us         4.96%     112.162us      37.387us       0.000us         0.00%      22.528us       7.509us             3  
-                                aten::_conv_depthwise2d         1.09%      24.630us         3.00%      67.820us      22.607us      22.528us        58.67%      22.528us       7.509us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.528us        58.67%      22.528us       7.509us             3  
-                                               aten::to         0.34%       7.671us        85.42%       1.931ms     321.787us       0.000us         0.00%      18.432us       3.072us             6  
-                                         aten::_to_copy         1.41%      31.890us        85.08%       1.923ms     320.509us       0.000us         0.00%      18.432us       3.072us             6  
-                                            aten::copy_         2.64%      59.711us        82.13%       1.856ms     309.384us      15.872us        41.33%      18.432us       3.072us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.544us        22.25%       8.544us       2.848us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        19.08%       7.328us       2.443us             3  
-                                Activity Buffer Request        64.20%       1.451ms        64.20%       1.451ms       1.451ms       2.560us         6.67%       2.560us       2.560us             1  
-                                    aten::empty_strided         1.54%      34.861us         1.54%      34.861us       5.810us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        16.32%     368.786us        16.32%     368.786us      40.976us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.93%      20.991us         1.15%      26.100us       2.900us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.41%       9.319us         0.41%       9.319us       0.621us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.44%       9.850us         0.44%       9.850us       3.283us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%       9.970us         0.44%       9.970us       3.323us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       7.041us         0.38%       8.701us       2.900us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     349.627us       908.24%     349.627us     349.627us             1  
+                                            torch_eager         7.45%     152.992us        99.76%       2.049ms       2.049ms       0.000us         0.00%      41.086us      41.086us             1  
+                                           aten::conv1d         0.32%       6.640us         6.06%     124.413us      41.471us       0.000us         0.00%      22.561us       7.520us             3  
+                                      aten::convolution         0.50%      10.370us         5.73%     117.773us      39.258us       0.000us         0.00%      22.561us       7.520us             3  
+                                     aten::_convolution         1.14%      23.411us         5.23%     107.403us      35.801us       0.000us         0.00%      22.561us       7.520us             3  
+                                aten::_conv_depthwise2d         1.15%      23.650us         3.29%      67.532us      22.511us      22.561us        58.61%      22.561us       7.520us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.561us        58.61%      22.561us       7.520us             3  
+                                               aten::to         0.33%       6.780us        84.82%       1.743ms     290.446us       0.000us         0.00%      18.525us       3.087us             6  
+                                         aten::_to_copy         1.29%      26.502us        84.49%       1.736ms     289.316us       0.000us         0.00%      18.525us       3.087us             6  
+                                            aten::copy_         2.40%      49.251us        81.74%       1.679ms     279.869us      15.934us        41.39%      18.525us       3.087us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.543us        22.19%       8.543us       2.848us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.391us        19.20%       7.391us       2.464us             3  
+                                Activity Buffer Request        69.84%       1.435ms        69.84%       1.435ms       1.435ms       2.591us         6.73%       2.591us       2.591us             1  
+                                    aten::empty_strided         1.47%      30.182us         1.47%      30.182us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.64%     218.664us        10.64%     218.664us      24.296us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.281us         1.17%      24.011us       2.668us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.739us         0.47%       9.739us       0.649us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      10.991us         0.53%      10.991us       3.664us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.46%       9.421us         0.46%       9.421us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.970us         0.36%       7.320us       2.440us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.260ms
-Self CUDA time total: 38.400us
+Self CPU time total: 2.054ms
+Self CUDA time total: 38.495us
 
 
 
@@ -4441,29 +4441,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     343.007us       838.09%     343.007us     343.007us             1  
-                                            torch_eager         6.47%     141.163us        99.73%       2.175ms       2.175ms       0.000us         0.00%      43.487us      43.487us             1  
-                                           aten::conv1d         0.27%       5.870us         5.52%     120.313us      40.104us       0.000us         0.00%      25.376us       8.459us             3  
-                                      aten::convolution         0.46%      10.120us         5.25%     114.443us      38.148us       0.000us         0.00%      25.376us       8.459us             3  
-                                     aten::_convolution         1.12%      24.490us         4.78%     104.323us      34.774us       0.000us         0.00%      25.376us       8.459us             3  
-                                aten::_conv_depthwise2d         1.00%      21.702us         2.89%      62.963us      20.988us      25.376us        62.00%      25.376us       8.459us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.376us        62.00%      25.376us       8.459us             3  
-                                               aten::to         0.28%       6.129us        86.46%       1.885ms     314.232us       0.000us         0.00%      18.111us       3.018us             6  
-                                         aten::_to_copy         1.13%      24.640us        86.18%       1.879ms     313.211us       0.000us         0.00%      18.111us       3.018us             6  
-                                            aten::copy_         2.51%      54.672us        83.58%       1.823ms     303.754us      15.551us        38.00%      18.111us       3.018us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.224us        20.09%       8.224us       2.741us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.327us        17.90%       7.327us       2.442us             3  
-                                Activity Buffer Request        66.59%       1.452ms        66.59%       1.452ms       1.452ms       2.560us         6.26%       2.560us       2.560us             1  
-                                    aten::empty_strided         1.47%      32.100us         1.47%      32.100us       5.350us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        15.50%     338.007us        15.50%     338.007us      37.556us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.84%      18.320us         1.10%      24.070us       2.674us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.43%       9.420us         0.43%       9.420us       0.628us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.46%      10.080us         0.46%      10.080us       3.360us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.42%       9.080us         0.42%       9.080us       3.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       5.960us         0.34%       7.390us       2.463us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     345.054us       837.81%     345.054us     345.054us             1  
+                                            torch_eager         7.39%     151.695us        99.75%       2.049ms       2.049ms       0.000us         0.00%      43.810us      43.810us             1  
+                                           aten::conv1d         0.32%       6.620us         6.03%     123.883us      41.294us       0.000us         0.00%      25.375us       8.458us             3  
+                                      aten::convolution         0.50%      10.320us         5.71%     117.263us      39.088us       0.000us         0.00%      25.375us       8.458us             3  
+                                     aten::_convolution         1.20%      24.592us         5.21%     106.943us      35.648us       0.000us         0.00%      25.375us       8.458us             3  
+                                aten::_conv_depthwise2d         1.13%      23.150us         3.19%      65.451us      21.817us      25.375us        61.61%      25.375us       8.458us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.375us        61.61%      25.375us       8.458us             3  
+                                               aten::to         0.31%       6.440us        84.93%       1.744ms     290.716us       0.000us         0.00%      18.435us       3.072us             6  
+                                         aten::_to_copy         1.24%      25.501us        84.61%       1.738ms     289.642us       0.000us         0.00%      18.435us       3.072us             6  
+                                            aten::copy_         2.41%      49.431us        81.91%       1.682ms     280.380us      15.810us        38.39%      18.435us       3.072us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.386us        20.36%       8.386us       2.795us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        18.03%       7.424us       2.475us             3  
+                                Activity Buffer Request        70.32%       1.444ms        70.32%       1.444ms       1.444ms       2.625us         6.37%       2.625us       2.625us             1  
+                                    aten::empty_strided         1.46%      30.070us         1.46%      30.070us       5.012us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.28%     211.144us        10.28%     211.144us      23.460us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      18.949us         1.19%      24.411us       2.712us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.313us         0.45%       9.313us       0.621us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%      10.601us         0.52%      10.601us       3.534us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       9.110us         0.44%       9.110us       3.037us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.930us         0.36%       7.410us       2.470us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.181ms
-Self CUDA time total: 40.927us
+Self CPU time total: 2.054ms
+Self CUDA time total: 41.185us
 
 
 
@@ -4473,29 +4473,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     367.004us       357.73%     367.004us     367.004us             1  
-                                            torch_eager         6.17%     126.763us        99.73%       2.049ms       2.049ms       0.000us         0.00%     108.512us     108.512us             1  
-                                           aten::conv1d         0.28%       5.761us         5.81%     119.372us      39.791us       0.000us         0.00%      70.432us      23.477us             3  
-                                      aten::convolution         0.48%       9.820us         5.53%     113.611us      37.870us       0.000us         0.00%      70.432us      23.477us             3  
-                                     aten::_convolution         1.11%      22.788us         5.05%     103.791us      34.597us       0.000us         0.00%      70.432us      23.477us             3  
-                                aten::_conv_depthwise2d         1.12%      22.910us         3.14%      64.601us      21.534us      70.432us        68.65%      70.432us      23.477us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.432us        68.65%      70.432us      23.477us             3  
-                                               aten::to         0.30%       6.130us        86.37%       1.774ms     295.680us       0.000us         0.00%      38.080us       6.347us             6  
-                                         aten::_to_copy         2.18%      44.819us        86.07%       1.768ms     294.658us       0.000us         0.00%      38.080us       6.347us             6  
-                                            aten::copy_         2.56%      52.622us        82.32%       1.691ms     281.815us      32.160us        31.35%      38.080us       6.347us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.504us        17.06%      17.504us       5.835us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.656us        14.29%      14.656us       4.885us             3  
-                                Activity Buffer Request        69.77%       1.433ms        69.77%       1.433ms       1.433ms       5.920us         5.77%       5.920us       5.920us             1  
-                                    aten::empty_strided         1.57%      32.241us         1.57%      32.241us       5.373us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.08%     227.645us        11.08%     227.645us      25.294us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.87%      17.849us         1.12%      23.070us       2.563us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       9.030us         0.44%       9.030us       0.602us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%      10.050us         0.49%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%       9.040us         0.44%       9.040us       3.013us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.163us         0.38%       7.782us       2.594us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.348us       338.39%     348.348us     348.348us             1  
+                                            torch_eager         7.21%     148.863us        99.73%       2.059ms       2.059ms       0.000us         0.00%     108.926us     108.926us             1  
+                                           aten::conv1d         0.31%       6.430us         5.95%     122.893us      40.964us       0.000us         0.00%      70.592us      23.531us             3  
+                                      aten::convolution         0.50%      10.290us         5.64%     116.463us      38.821us       0.000us         0.00%      70.592us      23.531us             3  
+                                     aten::_convolution         1.17%      24.211us         5.14%     106.173us      35.391us       0.000us         0.00%      70.592us      23.531us             3  
+                                aten::_conv_depthwise2d         1.12%      23.052us         3.16%      65.282us      21.761us      70.592us        68.57%      70.592us      23.531us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.592us        68.57%      70.592us      23.531us             3  
+                                               aten::to         0.31%       6.372us        85.15%       1.758ms     292.949us       0.000us         0.00%      38.334us       6.389us             6  
+                                         aten::_to_copy         1.20%      24.680us        84.84%       1.751ms     291.887us       0.000us         0.00%      38.334us       6.389us             6  
+                                            aten::copy_         2.47%      51.072us        82.20%       1.697ms     282.787us      32.350us        31.43%      38.334us       6.389us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.695us        17.19%      17.695us       5.898us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.655us        14.24%      14.655us       4.885us             3  
+                                Activity Buffer Request        70.59%       1.457ms        70.59%       1.457ms       1.457ms       5.984us         5.81%       5.984us       5.984us             1  
+                                    aten::empty_strided         1.45%      29.921us         1.45%      29.921us       4.987us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.23%     211.264us        10.23%     211.264us      23.474us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.462us         1.17%      24.111us       2.679us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.709us         0.47%       9.709us       0.647us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.780us         0.47%       9.780us       3.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.740us         0.47%       9.740us       3.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.880us         0.35%       7.260us       2.420us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.054ms
-Self CUDA time total: 102.592us
+Self CPU time total: 2.064ms
+Self CUDA time total: 102.942us
 
 
 
@@ -4505,29 +4505,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.959us       299.49%     336.959us     336.959us             1  
-                                            torch_eager         6.25%     125.522us        99.75%       2.004ms       2.004ms       0.000us         0.00%     118.493us     118.493us             1  
-                                           aten::conv1d         0.38%       7.700us         5.98%     120.223us      40.074us       0.000us         0.00%      80.479us      26.826us             3  
-                                      aten::convolution         0.49%       9.780us         5.60%     112.523us      37.508us       0.000us         0.00%      80.479us      26.826us             3  
-                                     aten::_convolution         1.13%      22.669us         5.11%     102.743us      34.248us       0.000us         0.00%      80.479us      26.826us             3  
-                                aten::_conv_depthwise2d         1.12%      22.452us         3.19%      64.073us      21.358us      80.479us        71.53%      80.479us      26.826us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.479us        71.53%      80.479us      26.826us             3  
-                                               aten::to         0.29%       5.910us        86.14%       1.731ms     288.442us       0.000us         0.00%      38.014us       6.336us             6  
-                                         aten::_to_copy         1.19%      24.001us        85.85%       1.725ms     287.457us       0.000us         0.00%      38.014us       6.336us             6  
-                                            aten::copy_         2.56%      51.481us        83.17%       1.671ms     278.473us      32.031us        28.47%      38.014us       6.336us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.567us        15.61%      17.567us       5.856us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.464us        12.86%      14.464us       4.821us             3  
-                                Activity Buffer Request        71.72%       1.441ms        71.72%       1.441ms       1.441ms       5.983us         5.32%       5.983us       5.983us             1  
-                                    aten::empty_strided         1.49%      29.901us         1.49%      29.901us       4.983us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.00%     200.814us        10.00%     200.814us      22.313us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      17.861us         1.15%      23.111us       2.568us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       8.970us         0.45%       8.970us       0.598us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%      10.050us         0.50%      10.050us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       9.169us         0.46%       9.169us       3.056us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.30%       6.030us         0.38%       7.560us       2.520us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     344.181us       304.53%     344.181us     344.181us             1  
+                                            torch_eager        14.98%     124.863us        99.35%     828.302us     828.302us       0.000us         0.00%     119.036us     119.036us             1  
+                                           aten::conv1d         0.70%       5.870us        14.55%     121.343us      40.448us       0.000us         0.00%      80.669us      26.890us             3  
+                                      aten::convolution         1.17%       9.720us        13.85%     115.473us      38.491us       0.000us         0.00%      80.669us      26.890us             3  
+                                     aten::_convolution         2.96%      24.691us        12.68%     105.753us      35.251us       0.000us         0.00%      80.669us      26.890us             3  
+                                aten::_conv_depthwise2d         2.65%      22.121us         7.65%      63.762us      21.254us      80.669us        71.38%      80.669us      26.890us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.669us        71.38%      80.669us      26.890us             3  
+                                               aten::to         0.77%       6.429us        66.53%     554.705us      92.451us       0.000us         0.00%      38.367us       6.394us             6  
+                                         aten::_to_copy         3.01%      25.101us        65.76%     548.276us      91.379us       0.000us         0.00%      38.367us       6.394us             6  
+                                            aten::copy_         6.16%      51.352us        59.05%     492.343us      82.057us      32.351us        28.62%      38.367us       6.394us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.696us        15.66%      17.696us       5.899us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.655us        12.97%      14.655us       4.885us             3  
+                                Activity Buffer Request        28.81%     240.197us        28.81%     240.197us     240.197us       6.016us         5.32%       6.016us       6.016us             1  
+                                    aten::empty_strided         3.70%      30.832us         3.70%      30.832us       5.139us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.65%     222.174us        26.65%     222.174us      24.686us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.09%      17.401us         2.70%      22.541us       2.505us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.05%       8.790us         1.05%       8.790us       0.586us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.34%      11.151us         1.34%      11.151us       3.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.09%       9.110us         1.09%       9.110us       3.037us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.89%       7.450us         1.05%       8.790us       2.930us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.009ms
-Self CUDA time total: 112.510us
+Self CPU time total: 833.752us
+Self CUDA time total: 113.020us
 
 
 
@@ -4537,29 +4537,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         5.98%     122.945us        97.76%       2.011ms       2.011ms       0.000us         0.00%     433.437us     433.437us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     423.709us       107.83%     423.709us     423.709us             1  
-                                           aten::conv1d         0.28%       5.760us         5.73%     117.851us      39.284us       0.000us         0.00%     250.941us      83.647us             3  
-                                      aten::convolution         0.48%       9.830us         5.45%     112.091us      37.364us       0.000us         0.00%     250.941us      83.647us             3  
-                                     aten::_convolution         1.12%      23.111us         4.97%     102.261us      34.087us       0.000us         0.00%     250.941us      83.647us             3  
-                                aten::_conv_depthwise2d         1.03%      21.200us         3.03%      62.360us      20.787us     250.941us        63.86%     250.941us      83.647us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     250.941us        63.86%     250.941us      83.647us             3  
-                                               aten::to         0.28%       5.851us        84.70%       1.742ms     290.313us       0.000us         0.00%     182.496us      30.416us             6  
-                                         aten::_to_copy         1.16%      23.919us        84.41%       1.736ms     289.338us       0.000us         0.00%     182.496us      30.416us             6  
-                                            aten::copy_         2.53%      51.981us        81.78%       1.682ms     280.333us     142.016us        36.14%     182.496us      30.416us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     101.952us        25.94%     101.952us      33.984us             3  
-                                Activity Buffer Request        70.64%       1.453ms        70.64%       1.453ms       1.453ms      40.480us        10.30%      40.480us      40.480us             1  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.064us        10.20%      40.064us      13.355us             3  
-                                    aten::empty_strided         1.46%      30.112us         1.46%      30.112us       5.019us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.67%     198.853us         9.67%     198.853us      22.095us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.91%      18.669us         1.18%      24.270us       2.697us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       9.151us         0.44%       9.151us       0.610us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.870us         0.48%       9.870us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.710us         0.47%       9.710us       3.237us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.29%       5.960us         0.36%       7.350us       2.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager        14.21%     122.455us        95.83%     825.681us     825.681us       0.000us         0.00%     433.339us     433.339us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     419.771us       106.59%     419.771us     419.771us             1  
+                                           aten::conv1d         0.75%       6.429us        14.10%     121.522us      40.507us       0.000us         0.00%     251.453us      83.818us             3  
+                                      aten::convolution         1.15%       9.929us        13.36%     115.093us      38.364us       0.000us         0.00%     251.453us      83.818us             3  
+                                     aten::_convolution         2.67%      23.042us        12.21%     105.164us      35.055us       0.000us         0.00%     251.453us      83.818us             3  
+                                aten::_conv_depthwise2d         2.60%      22.440us         7.52%      64.810us      21.603us     251.453us        63.85%     251.453us      83.818us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     251.453us        63.85%     251.453us      83.818us             3  
+                                               aten::to         0.70%       6.001us        64.14%     552.672us      92.112us       0.000us         0.00%     181.886us      30.314us             6  
+                                         aten::_to_copy         2.73%      23.540us        63.45%     546.671us      91.112us       0.000us         0.00%     181.886us      30.314us             6  
+                                            aten::copy_         5.94%      51.140us        57.36%     494.211us      82.368us     142.367us        36.15%     181.886us      30.314us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.367us        25.99%     102.367us      34.122us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.000us        10.16%      40.000us      13.333us             3  
+                                Activity Buffer Request        29.04%     250.247us        29.04%     250.247us     250.247us      39.519us        10.03%      39.519us      39.519us             1  
+                                    aten::empty_strided         3.36%      28.920us         3.36%      28.920us       4.820us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.89%     214.494us        24.89%     214.494us      23.833us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.98%      17.062us         2.59%      22.273us       2.475us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       9.391us         1.09%       9.391us       0.626us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.24%      10.660us         1.24%      10.660us       3.553us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.17%      10.040us         1.17%      10.040us       3.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.86%       7.370us         1.02%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.057ms
-Self CUDA time total: 392.957us
+Self CPU time total: 861.602us
+Self CUDA time total: 393.820us
 
 
 
@@ -4569,29 +4569,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         5.86%     122.119us        95.18%       1.984ms       1.984ms       0.000us         0.00%     485.373us     485.373us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     475.549us       106.61%     475.549us     475.549us             1  
-                                           aten::conv1d         0.29%       6.020us         5.58%     116.291us      38.764us       0.000us         0.00%     298.429us      99.476us             3  
-                                      aten::convolution         0.46%       9.580us         5.29%     110.271us      36.757us       0.000us         0.00%     298.429us      99.476us             3  
-                                     aten::_convolution         1.07%      22.391us         4.83%     100.691us      33.564us       0.000us         0.00%     298.429us      99.476us             3  
-                                aten::_conv_depthwise2d         1.02%      21.160us         3.01%      62.730us      20.910us     298.429us        66.91%     298.429us      99.476us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.429us        66.91%     298.429us      99.476us             3  
-                                               aten::to         0.28%       5.929us        82.40%       1.718ms     286.300us       0.000us         0.00%     186.944us      31.157us             6  
-                                         aten::_to_copy         1.13%      23.472us        82.12%       1.712ms     285.312us       0.000us         0.00%     186.944us      31.157us             6  
-                                            aten::copy_         2.45%      51.061us        79.57%       1.659ms     276.443us     147.616us        33.09%     186.944us      31.157us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     107.712us        24.15%     107.712us      35.904us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.904us         8.95%      39.904us      13.301us             3  
-                                Activity Buffer Request        68.65%       1.431ms        68.65%       1.431ms       1.431ms      39.328us         8.82%      39.328us      39.328us             1  
-                                    aten::empty_strided         1.43%      29.742us         1.43%      29.742us       4.957us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.54%     198.903us         9.54%     198.903us      22.100us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.85%      17.731us         1.11%      23.210us       2.579us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       9.210us         0.44%       9.210us       0.614us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%       9.850us         0.47%       9.850us       3.283us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.45%       9.320us         0.45%       9.320us       3.107us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.850us         0.35%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager        15.32%     134.312us        91.67%     803.971us     803.971us       0.000us         0.00%     487.924us     487.924us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     476.501us       106.34%     476.501us     476.501us             1  
+                                           aten::conv1d         0.67%       5.860us        13.82%     121.173us      40.391us       0.000us         0.00%     299.161us      99.720us             3  
+                                      aten::convolution         1.17%      10.220us        13.15%     115.313us      38.438us       0.000us         0.00%     299.161us      99.720us             3  
+                                     aten::_convolution         2.67%      23.450us        11.98%     105.093us      35.031us       0.000us         0.00%     299.161us      99.720us             3  
+                                aten::_conv_depthwise2d         2.56%      22.451us         7.48%      65.623us      21.874us     299.161us        66.76%     299.161us      99.720us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     299.161us        66.76%     299.161us      99.720us             3  
+                                               aten::to         0.69%       6.051us        59.17%     518.906us      86.484us       0.000us         0.00%     188.763us      31.460us             6  
+                                         aten::_to_copy         2.71%      23.771us        58.48%     512.855us      85.476us       0.000us         0.00%     188.763us      31.460us             6  
+                                            aten::copy_         5.69%      49.880us        52.31%     458.742us      76.457us     148.924us        33.24%     188.763us      31.460us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.861us        24.29%     108.861us      36.287us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.063us         8.94%      40.063us      13.354us             3  
+                                Activity Buffer Request        25.01%     219.366us        25.01%     219.366us     219.366us      39.839us         8.89%      39.839us      39.839us             1  
+                                    aten::empty_strided         3.46%      30.342us         3.46%      30.342us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.34%     213.439us        24.34%     213.439us      23.715us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.98%      17.400us         2.59%      22.720us       2.524us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       9.540us         1.09%       9.540us       0.636us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.14%      10.010us         1.14%      10.010us       3.337us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.05%       9.219us         1.05%       9.219us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.66%       5.750us         0.82%       7.210us       2.403us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.085ms
-Self CUDA time total: 446.045us
+Self CPU time total: 876.983us
+Self CUDA time total: 448.085us
 
 
 
@@ -4601,29 +4601,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.833us      1729.88%     323.833us     323.833us             1  
-                                            torch_eager        14.51%     116.191us        99.37%     795.884us     795.884us       0.000us         0.00%      20.608us      20.608us             1  
-                                               aten::to         0.75%       6.009us        67.15%     537.870us      89.645us       0.000us         0.00%      13.376us       2.229us             6  
-                                         aten::_to_copy         2.93%      23.471us        66.40%     531.861us      88.644us       0.000us         0.00%      13.376us       2.229us             6  
-                                            aten::copy_         6.32%      50.599us        59.65%     477.769us      79.628us      11.488us        61.37%      13.376us       2.229us             6  
-                                           aten::conv1d         0.81%       6.510us        14.38%     115.173us      38.391us       0.000us         0.00%       7.232us       2.411us             3  
-                                      aten::convolution         1.28%      10.221us        13.57%     108.663us      36.221us       0.000us         0.00%       7.232us       2.411us             3  
-                                     aten::_convolution         2.73%      21.890us        12.29%      98.442us      32.814us       0.000us         0.00%       7.232us       2.411us             3  
-                                aten::_conv_depthwise2d         2.76%      22.080us         7.70%      61.700us      20.567us       7.232us        38.63%       7.232us       2.411us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.232us        38.63%       7.232us       2.411us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        31.45%       5.888us       1.963us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.600us        29.91%       5.600us       1.867us             3  
-                                Activity Buffer Request        31.20%     249.924us        31.20%     249.924us     249.924us       1.888us        10.09%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.82%      30.621us         3.82%      30.621us       5.103us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.75%     198.236us        24.75%     198.236us      22.026us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.09%      16.762us         2.71%      21.692us       2.410us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.04%       8.330us         1.04%       8.330us       0.555us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.15%       9.220us         1.15%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.17%       9.410us         1.17%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.72%       5.800us         0.89%       7.160us       2.387us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.392us      1804.85%     338.392us     338.392us             1  
+                                            torch_eager        18.33%     161.236us        99.35%     873.703us     873.703us       0.000us         0.00%      20.637us      20.637us             1  
+                                               aten::to         0.69%       6.070us        63.71%     560.224us      93.371us       0.000us         0.00%      13.406us       2.234us             6  
+                                         aten::_to_copy         2.78%      24.471us        63.02%     554.154us      92.359us       0.000us         0.00%      13.406us       2.234us             6  
+                                            aten::copy_         5.94%      52.212us        56.85%     499.953us      83.325us      11.518us        61.43%      13.406us       2.234us             6  
+                                           aten::conv1d         0.64%       5.659us        14.02%     123.282us      41.094us       0.000us         0.00%       7.231us       2.410us             3  
+                                      aten::convolution         1.14%       9.999us        13.38%     117.623us      39.208us       0.000us         0.00%       7.231us       2.410us             3  
+                                     aten::_convolution         2.72%      23.952us        12.24%     107.624us      35.875us       0.000us         0.00%       7.231us       2.410us             3  
+                                aten::_conv_depthwise2d         2.67%      23.519us         7.63%      67.130us      22.377us       7.231us        38.57%       7.231us       2.410us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.231us        38.57%       7.231us       2.410us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.854us        31.22%       5.854us       1.951us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        30.21%       5.664us       1.888us             3  
+                                Activity Buffer Request        29.52%     259.596us        29.52%     259.596us     259.596us       1.888us        10.07%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.38%      29.730us         3.38%      29.730us       4.955us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        23.99%     210.946us        23.99%     210.946us      23.438us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.07%      18.190us         2.71%      23.871us       2.652us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.11%       9.761us         1.11%       9.761us       0.651us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.24%      10.890us         1.24%      10.890us       3.630us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.13%       9.920us         1.13%       9.920us       3.307us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.972us         0.85%       7.452us       2.484us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 800.944us
-Self CUDA time total: 18.720us
+Self CPU time total: 879.393us
+Self CUDA time total: 18.749us
 
 
 
@@ -4633,29 +4633,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     324.666us      1676.91%     324.666us     324.666us             1  
-                                            torch_eager        15.17%     119.302us        99.37%     781.483us     781.483us       0.000us         0.00%      21.249us      21.249us             1  
-                                               aten::to         0.72%       5.648us        65.85%     517.928us      86.321us       0.000us         0.00%      13.345us       2.224us             6  
-                                         aten::_to_copy         2.87%      22.611us        65.14%     512.280us      85.380us       0.000us         0.00%      13.345us       2.224us             6  
-                                            aten::copy_         6.22%      48.900us        58.49%     460.037us      76.673us      11.457us        59.18%      13.345us       2.224us             6  
-                                           aten::conv1d         0.87%       6.869us        14.99%     117.911us      39.304us       0.000us         0.00%       7.904us       2.635us             3  
-                                      aten::convolution         1.27%      10.002us        14.12%     111.042us      37.014us       0.000us         0.00%       7.904us       2.635us             3  
-                                     aten::_convolution         2.89%      22.710us        12.85%     101.040us      33.680us       0.000us         0.00%       7.904us       2.635us             3  
-                                aten::_conv_depthwise2d         2.75%      21.590us         8.00%      62.920us      20.973us       7.904us        40.82%       7.904us       2.635us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        40.82%       7.904us       2.635us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.825us        30.09%       5.825us       1.942us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.09%       5.632us       1.877us             3  
-                                Activity Buffer Request        30.25%     237.875us        30.25%     237.875us     237.875us       1.888us         9.75%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.77%      29.632us         3.77%      29.632us       4.939us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.87%     195.612us        24.87%     195.612us      21.735us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.14%      16.821us         2.78%      21.881us       2.431us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.08%       8.481us         1.08%       8.481us       0.565us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.22%       9.600us         1.22%       9.600us       3.200us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.19%       9.380us         1.19%       9.380us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       5.869us         0.93%       7.280us       2.427us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.934us      1741.87%     338.934us     338.934us             1  
+                                            torch_eager        16.71%     145.362us        99.29%     863.592us     863.592us       0.000us         0.00%      21.314us      21.314us             1  
+                                               aten::to         0.71%       6.200us        65.36%     568.524us      94.754us       0.000us         0.00%      13.282us       2.214us             6  
+                                         aten::_to_copy         2.85%      24.831us        64.65%     562.324us      93.721us       0.000us         0.00%      13.282us       2.214us             6  
+                                            aten::copy_         5.81%      50.550us        58.39%     507.883us      84.647us      11.426us        58.72%      13.282us       2.214us             6  
+                                           aten::conv1d         0.78%       6.753us        14.06%     122.315us      40.772us       0.000us         0.00%       8.032us       2.677us             3  
+                                      aten::convolution         1.19%      10.380us        13.29%     115.562us      38.521us       0.000us         0.00%       8.032us       2.677us             3  
+                                     aten::_convolution         2.63%      22.841us        12.09%     105.182us      35.061us       0.000us         0.00%       8.032us       2.677us             3  
+                                aten::_conv_depthwise2d         2.65%      23.042us         7.65%      66.512us      22.171us       8.032us        41.28%       8.032us       2.677us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.032us        41.28%       8.032us       2.677us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.825us        29.94%       5.825us       1.942us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.601us        28.79%       5.601us       1.867us             3  
+                                Activity Buffer Request        30.62%     266.307us        30.62%     266.307us     266.307us       1.856us         9.54%       1.856us       1.856us             1  
+                                    aten::empty_strided         3.40%      29.610us         3.40%      29.610us       4.935us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        24.61%     214.076us        24.61%     214.076us      23.786us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.02%      17.612us         2.63%      22.841us       2.538us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.02%       8.840us         1.02%       8.840us       0.589us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.22%      10.630us         1.22%      10.630us       3.543us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.13%       9.790us         1.13%       9.790us       3.263us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.67%       5.798us         0.82%       7.109us       2.370us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 786.473us
-Self CUDA time total: 19.361us
+Self CPU time total: 869.783us
+Self CUDA time total: 19.458us
 
 
 
@@ -4665,29 +4665,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     328.865us      1704.41%     328.865us     328.865us             1  
-                                            torch_eager        14.92%     117.622us        99.37%     783.184us     783.184us       0.000us         0.00%      21.439us      21.439us             1  
-                                               aten::to         0.74%       5.810us        66.49%     524.079us      87.347us       0.000us         0.00%      14.207us       2.368us             6  
-                                         aten::_to_copy         3.01%      23.701us        65.75%     518.269us      86.378us       0.000us         0.00%      14.207us       2.368us             6  
-                                            aten::copy_         6.49%      51.190us        58.71%     462.718us      77.120us      12.063us        62.52%      14.207us       2.368us             6  
-                                           aten::conv1d         0.75%       5.890us        14.60%     115.093us      38.364us       0.000us         0.00%       7.232us       2.411us             3  
-                                      aten::convolution         1.22%       9.630us        13.86%     109.203us      36.401us       0.000us         0.00%       7.232us       2.411us             3  
-                                     aten::_convolution         2.83%      22.270us        12.63%      99.573us      33.191us       0.000us         0.00%       7.232us       2.411us             3  
-                                aten::_conv_depthwise2d         2.80%      22.070us         7.82%      61.673us      20.558us       7.232us        37.48%       7.232us       2.411us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.232us        37.48%       7.232us       2.411us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        32.34%       6.240us       2.080us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.823us        30.18%       5.823us       1.941us             3  
-                                Activity Buffer Request        29.70%     234.095us        29.70%     234.095us     234.095us       2.144us        11.11%       2.144us       2.144us             1  
-                                    aten::empty_strided         4.04%      31.850us         4.04%      31.850us       5.308us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        25.25%     199.015us        25.25%     199.015us      22.113us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.15%      16.950us         2.78%      21.920us       2.436us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.05%       8.280us         1.05%       8.280us       0.552us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.22%       9.600us         1.22%       9.600us       3.200us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.07%       8.421us         1.07%       8.421us       2.807us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.76%       5.960us         0.92%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.862us      1751.78%     340.862us     340.862us             1  
+                                            torch_eager         8.44%     173.073us        99.74%       2.045ms       2.045ms       0.000us         0.00%      21.635us      21.635us             1  
+                                               aten::to         0.33%       6.670us        84.06%       1.723ms     287.196us       0.000us         0.00%      14.307us       2.385us             6  
+                                         aten::_to_copy         1.21%      24.883us        83.74%       1.717ms     286.084us       0.000us         0.00%      14.307us       2.385us             6  
+                                            aten::copy_         2.36%      48.471us        81.06%       1.662ms     276.949us      12.130us        62.34%      14.307us       2.385us             6  
+                                           aten::conv1d         0.29%       5.970us         5.84%     119.613us      39.871us       0.000us         0.00%       7.328us       2.443us             3  
+                                      aten::convolution         0.48%       9.780us         5.54%     113.643us      37.881us       0.000us         0.00%       7.328us       2.443us             3  
+                                     aten::_convolution         1.14%      23.420us         5.07%     103.863us      34.621us       0.000us         0.00%       7.328us       2.443us             3  
+                                aten::_conv_depthwise2d         1.10%      22.512us         3.15%      64.503us      21.501us       7.328us        37.66%       7.328us       2.443us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        37.66%       7.328us       2.443us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.241us        32.07%       6.241us       2.080us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.889us        30.27%       5.889us       1.963us             3  
+                                Activity Buffer Request        69.34%       1.421ms        69.34%       1.421ms       1.421ms       2.177us        11.19%       2.177us       2.177us             1  
+                                    aten::empty_strided         1.46%      29.930us         1.46%      29.930us       4.988us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.50%     215.256us        10.50%     215.256us      23.917us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.86%      17.669us         1.13%      23.180us       2.576us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.581us         0.47%       9.581us       0.639us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.759us         0.48%       9.759us       3.253us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.742us         0.43%       8.742us       2.914us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.760us         0.35%       7.110us       2.370us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 788.184us
-Self CUDA time total: 19.295us
+Self CPU time total: 2.050ms
+Self CUDA time total: 19.458us
 
 
 
@@ -4697,29 +4697,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.745us      1665.90%     334.745us     334.745us             1  
-                                            torch_eager        14.26%     118.712us        99.40%     827.395us     827.395us       0.000us         0.00%      22.270us      22.270us             1  
-                                               aten::to         0.70%       5.840us        67.41%     561.119us      93.520us       0.000us         0.00%      14.335us       2.389us             6  
-                                         aten::_to_copy         2.86%      23.780us        66.71%     555.279us      92.546us       0.000us         0.00%      14.335us       2.389us             6  
-                                            aten::copy_         6.22%      51.741us        60.26%     501.588us      83.598us      12.159us        60.51%      14.335us       2.389us             6  
-                                           aten::conv1d         0.81%       6.751us        14.52%     120.873us      40.291us       0.000us         0.00%       7.935us       2.645us             3  
-                                      aten::convolution         1.20%       9.989us        13.71%     114.122us      38.041us       0.000us         0.00%       7.935us       2.645us             3  
-                                     aten::_convolution         2.78%      23.181us        12.51%     104.133us      34.711us       0.000us         0.00%       7.935us       2.645us             3  
-                                aten::_conv_depthwise2d         2.64%      22.000us         7.72%      64.243us      21.414us       7.935us        39.49%       7.935us       2.645us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.935us        39.49%       7.935us       2.645us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.239us        31.05%       6.239us       2.080us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        29.46%       5.920us       1.973us             3  
-                                Activity Buffer Request        32.59%     271.245us        32.59%     271.245us     271.245us       2.176us        10.83%       2.176us       2.176us             1  
-                                    aten::empty_strided         3.59%      29.911us         3.59%      29.911us       4.985us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.22%     201.614us        24.22%     201.614us      22.402us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.06%      17.131us         2.68%      22.291us       2.477us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       8.900us         1.07%       8.900us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.16%       9.640us         1.16%       9.640us       3.213us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       9.591us         1.15%       9.591us       3.197us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.79%       6.549us         0.97%       8.109us       2.703us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     367.067us      1820.95%     367.067us     367.067us             1  
+                                            torch_eager        17.50%     145.595us        99.30%     826.111us     826.111us       0.000us         0.00%      22.366us      22.366us             1  
+                                               aten::to         0.75%       6.199us        63.72%     530.082us      88.347us       0.000us         0.00%      14.431us       2.405us             6  
+                                         aten::_to_copy         2.95%      24.573us        62.97%     523.883us      87.314us       0.000us         0.00%      14.431us       2.405us             6  
+                                            aten::copy_         6.31%      52.521us        56.15%     467.170us      77.862us      12.223us        60.64%      14.431us       2.405us             6  
+                                           aten::conv1d         0.69%       5.760us        14.59%     121.354us      40.451us       0.000us         0.00%       7.935us       2.645us             3  
+                                      aten::convolution         1.24%      10.281us        13.89%     115.594us      38.531us       0.000us         0.00%       7.935us       2.645us             3  
+                                     aten::_convolution         2.68%      22.269us        12.66%     105.313us      35.104us       0.000us         0.00%       7.935us       2.645us             3  
+                                aten::_conv_depthwise2d         2.73%      22.701us         8.02%      66.711us      22.237us       7.935us        39.36%       7.935us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.935us        39.36%       7.935us       2.645us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        31.27%       6.304us       2.101us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.919us        29.36%       5.919us       1.973us             3  
+                                Activity Buffer Request        27.00%     224.665us        27.00%     224.665us     224.665us       2.208us        10.95%       2.208us       2.208us             1  
+                                    aten::empty_strided         3.86%      32.140us         3.86%      32.140us       5.357us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.71%     213.894us        25.71%     213.894us      23.766us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.05%      17.041us         2.71%      22.553us       2.506us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.14%       9.503us         1.14%       9.503us       0.634us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.31%      10.920us         1.31%      10.920us       3.640us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.10%       9.180us         1.10%       9.180us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.81%       6.740us         0.98%       8.160us       2.720us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 832.395us
-Self CUDA time total: 20.094us
+Self CPU time total: 831.951us
+Self CUDA time total: 20.158us
 
 
 
@@ -4729,29 +4729,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.142us       918.64%     330.142us     330.142us             1  
-                                            torch_eager        14.68%     120.212us        99.34%     813.674us     813.674us       0.000us         0.00%      38.530us      38.530us             1  
-                                           aten::conv1d         0.79%       6.500us        14.15%     115.923us      38.641us       0.000us         0.00%      20.161us       6.720us             3  
-                                      aten::convolution         1.18%       9.650us        13.36%     109.423us      36.474us       0.000us         0.00%      20.161us       6.720us             3  
-                                     aten::_convolution         2.75%      22.509us        12.18%      99.773us      33.258us       0.000us         0.00%      20.161us       6.720us             3  
-                                aten::_conv_depthwise2d         2.55%      20.922us         7.56%      61.883us      20.628us      20.161us        56.10%      20.161us       6.720us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.161us        56.10%      20.161us       6.720us             3  
-                                               aten::to         0.72%       5.880us        67.15%     549.969us      91.661us       0.000us         0.00%      18.369us       3.061us             6  
-                                         aten::_to_copy         2.82%      23.099us        66.43%     544.089us      90.682us       0.000us         0.00%      18.369us       3.061us             6  
-                                            aten::copy_         6.44%      52.723us        59.97%     491.160us      81.860us      15.777us        43.90%      18.369us       3.061us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        23.51%       8.448us       2.816us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.329us        20.39%       7.329us       2.443us             3  
-                                Activity Buffer Request        32.20%     263.764us        32.20%     263.764us     263.764us       2.592us         7.21%       2.592us       2.592us             1  
-                                    aten::empty_strided         3.64%      29.830us         3.64%      29.830us       4.972us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        24.00%     196.543us        24.00%     196.543us      21.838us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.14%      17.540us         2.77%      22.711us       2.523us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       8.761us         1.07%       8.761us       0.584us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.21%       9.871us         1.21%       9.871us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.13%       9.220us         1.13%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.68%       5.610us         0.85%       7.000us       2.333us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     363.100us      1005.93%     363.100us     363.100us             1  
+                                            torch_eager        14.77%     122.163us        99.35%     821.971us     821.971us       0.000us         0.00%      38.688us      38.688us             1  
+                                           aten::conv1d         0.72%       5.951us        17.29%     143.024us      47.675us       0.000us         0.00%      20.160us       6.720us             3  
+                                      aten::convolution         1.22%      10.110us        16.57%     137.073us      45.691us       0.000us         0.00%      20.160us       6.720us             3  
+                                     aten::_convolution         3.04%      25.151us        15.35%     126.963us      42.321us       0.000us         0.00%      20.160us       6.720us             3  
+                                aten::_conv_depthwise2d         4.80%      39.711us        10.31%      85.271us      28.424us      20.160us        55.85%      20.160us       6.720us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.160us        55.85%      20.160us       6.720us             3  
+                                               aten::to         0.75%       6.172us        63.79%     527.804us      87.967us       0.000us         0.00%      18.528us       3.088us             6  
+                                         aten::_to_copy         2.99%      24.751us        63.05%     521.632us      86.939us       0.000us         0.00%      18.528us       3.088us             6  
+                                            aten::copy_         6.14%      50.790us        56.45%     467.021us      77.837us      15.936us        44.15%      18.528us       3.088us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.512us        23.58%       8.512us       2.837us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.424us        20.57%       7.424us       2.475us             3  
+                                Activity Buffer Request        27.93%     231.066us        27.93%     231.066us     231.066us       2.592us         7.18%       2.592us       2.592us             1  
+                                    aten::empty_strided         3.61%      29.860us         3.61%      29.860us       4.977us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.33%     209.585us        25.33%     209.585us      23.287us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      17.441us         2.75%      22.791us       2.532us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.15%       9.501us         1.15%       9.501us       0.633us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.26%      10.400us         1.26%      10.400us       3.467us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.30%      10.740us         1.30%      10.740us       3.580us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.76%       6.269us         0.93%       7.730us       2.577us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 819.054us
-Self CUDA time total: 35.938us
+Self CPU time total: 827.381us
+Self CUDA time total: 36.096us
 
 
 
@@ -4761,29 +4761,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     330.946us       872.79%     330.946us     330.946us             1  
-                                            torch_eager         6.07%     120.841us        99.75%       1.987ms       1.987ms       0.000us         0.00%      40.478us      40.478us             1  
-                                           aten::conv1d         0.33%       6.510us         5.92%     117.833us      39.278us       0.000us         0.00%      22.271us       7.424us             3  
-                                      aten::convolution         0.49%       9.850us         5.59%     111.323us      37.108us       0.000us         0.00%      22.271us       7.424us             3  
-                                     aten::_convolution         1.11%      22.181us         5.10%     101.473us      33.824us       0.000us         0.00%      22.271us       7.424us             3  
-                                aten::_conv_depthwise2d         1.10%      21.811us         3.17%      63.042us      21.014us      22.271us        58.73%      22.271us       7.424us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.271us        58.73%      22.271us       7.424us             3  
-                                               aten::to         0.30%       5.981us        86.38%       1.720ms     286.727us       0.000us         0.00%      18.207us       3.034us             6  
-                                         aten::_to_copy         1.18%      23.522us        86.08%       1.714ms     285.730us       0.000us         0.00%      18.207us       3.034us             6  
-                                            aten::copy_         2.55%      50.829us        83.41%       1.661ms     276.860us      15.647us        41.27%      18.207us       3.034us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.320us        21.94%       8.320us       2.773us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.327us        19.32%       7.327us       2.442us             3  
-                                Activity Buffer Request        72.02%       1.434ms        72.02%       1.434ms       1.434ms       2.560us         6.75%       2.560us       2.560us             1  
-                                    aten::empty_strided         1.49%      29.700us         1.49%      29.700us       4.950us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         9.93%     197.835us         9.93%     197.835us      21.982us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.90%      17.980us         1.17%      23.390us       2.599us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.44%       8.840us         0.44%       8.840us       0.589us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%       9.970us         0.50%       9.970us       3.323us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.410us         0.47%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.31%       6.110us         0.38%       7.490us       2.497us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.025us       883.88%     336.025us     336.025us             1  
+                                            torch_eager        14.70%     120.902us        99.36%     817.351us     817.351us       0.000us         0.00%      40.610us      40.610us             1  
+                                           aten::conv1d         0.71%       5.820us        14.44%     118.823us      39.608us       0.000us         0.00%      22.304us       7.435us             3  
+                                      aten::convolution         1.12%       9.190us        13.74%     113.003us      37.668us       0.000us         0.00%      22.304us       7.435us             3  
+                                     aten::_convolution         2.83%      23.270us        12.62%     103.813us      34.604us       0.000us         0.00%      22.304us       7.435us             3  
+                                aten::_conv_depthwise2d         2.83%      23.309us         7.79%      64.072us      21.357us      22.304us        58.67%      22.304us       7.435us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.304us        58.67%      22.304us       7.435us             3  
+                                               aten::to         0.73%       5.990us        66.75%     549.075us      91.513us       0.000us         0.00%      18.306us       3.051us             6  
+                                         aten::_to_copy         2.91%      23.953us        66.02%     543.085us      90.514us       0.000us         0.00%      18.306us       3.051us             6  
+                                            aten::copy_         6.07%      49.902us        59.57%     490.042us      81.674us      15.713us        41.33%      18.306us       3.051us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.353us        21.97%       8.353us       2.784us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        19.36%       7.360us       2.453us             3  
+                                Activity Buffer Request        30.85%     253.806us        30.85%     253.806us     253.806us       2.593us         6.82%       2.593us       2.593us             1  
+                                    aten::empty_strided         3.54%      29.090us         3.54%      29.090us       4.848us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.29%     208.074us        25.29%     208.074us      23.119us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.19%      18.051us         2.84%      23.371us       2.597us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.11%       9.160us         1.11%       9.160us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.21%       9.961us         1.21%       9.961us       3.320us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.10%       9.062us         1.10%       9.062us       3.021us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.80%       6.580us         0.96%       7.920us       2.640us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.992ms
-Self CUDA time total: 37.918us
+Self CPU time total: 822.611us
+Self CUDA time total: 38.017us
 
 
 
@@ -4793,29 +4793,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     385.308us       602.34%     385.308us     385.308us             1  
-                                            torch_eager        14.42%     123.450us        99.41%     851.045us     851.045us       0.000us         0.00%      68.065us      68.065us             1  
-                                           aten::conv1d         0.67%       5.711us        13.49%     115.513us      38.504us       0.000us         0.00%      41.633us      13.878us             3  
-                                      aten::convolution         1.22%      10.470us        12.83%     109.802us      36.601us       0.000us         0.00%      41.633us      13.878us             3  
-                                     aten::_convolution         2.63%      22.491us        11.60%      99.332us      33.111us       0.000us         0.00%      41.633us      13.878us             3  
-                                aten::_conv_depthwise2d         2.49%      21.351us         7.22%      61.852us      20.617us      41.633us        65.08%      41.633us      13.878us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.633us        65.08%      41.633us      13.878us             3  
-                                               aten::to         0.71%       6.120us        68.08%     582.862us      97.144us       0.000us         0.00%      26.432us       4.405us             6  
-                                         aten::_to_copy         2.87%      24.611us        67.37%     576.742us      96.124us       0.000us         0.00%      26.432us       4.405us             6  
-                                            aten::copy_         6.21%      53.173us        60.75%     520.070us      86.678us      22.336us        34.92%      26.432us       4.405us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.936us        18.66%      11.936us       3.979us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        16.26%      10.400us       3.467us             3  
-                                Activity Buffer Request        28.33%     242.554us        28.33%     242.554us     242.554us       4.096us         6.40%       4.096us       4.096us             1  
-                                    aten::empty_strided         3.74%      32.061us         3.74%      32.061us       5.344us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        28.79%     246.523us        28.79%     246.523us      27.391us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.02%      17.269us         2.63%      22.529us       2.503us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.08%       9.240us         1.08%       9.240us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.11%       9.521us         1.11%       9.521us       3.174us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.03%       8.800us         1.03%       8.800us       2.933us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.68%       5.830us         0.84%       7.230us       2.410us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.486us       522.89%     335.486us     335.486us             1  
+                                            torch_eager        15.29%     123.163us        99.38%     800.491us     800.491us       0.000us         0.00%      68.256us      68.256us             1  
+                                           aten::conv1d         0.73%       5.840us        14.87%     119.763us      39.921us       0.000us         0.00%      41.760us      13.920us             3  
+                                      aten::convolution         1.21%       9.761us        14.14%     113.923us      37.974us       0.000us         0.00%      41.760us      13.920us             3  
+                                     aten::_convolution         2.84%      22.911us        12.93%     104.162us      34.721us       0.000us         0.00%      41.760us      13.920us             3  
+                                aten::_conv_depthwise2d         2.80%      22.570us         8.02%      64.572us      21.524us      41.760us        65.09%      41.760us      13.920us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.760us        65.09%      41.760us      13.920us             3  
+                                               aten::to         0.73%       5.842us        65.67%     528.904us      88.151us       0.000us         0.00%      26.496us       4.416us             6  
+                                         aten::_to_copy         2.94%      23.712us        64.94%     523.062us      87.177us       0.000us         0.00%      26.496us       4.416us             6  
+                                            aten::copy_         6.02%      48.492us        58.29%     469.521us      78.253us      22.400us        34.91%      26.496us       4.416us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.968us        18.65%      11.968us       3.989us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        16.26%      10.432us       3.477us             3  
+                                Activity Buffer Request        29.33%     236.206us        29.33%     236.206us     236.206us       4.096us         6.38%       4.096us       4.096us             1  
+                                    aten::empty_strided         3.70%      29.829us         3.70%      29.829us       4.971us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.91%     208.693us        25.91%     208.693us      23.188us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.18%      17.569us         2.86%      23.069us       2.563us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.14%       9.222us         1.14%       9.222us       0.615us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.631us         1.20%       9.631us       3.210us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.06%       8.501us         1.06%       8.501us       2.834us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.83%       6.660us         0.99%       7.990us       2.663us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 856.136us
-Self CUDA time total: 63.969us
+Self CPU time total: 805.451us
+Self CUDA time total: 64.160us
 
 
 
@@ -4825,29 +4825,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.859us       513.70%     357.859us     357.859us             1  
-                                            torch_eager        20.53%     180.503us        99.40%     873.955us     873.955us       0.000us         0.00%      73.695us      73.695us             1  
-                                           aten::conv1d         0.63%       5.530us        15.78%     138.703us      46.234us       0.000us         0.00%      47.359us      15.786us             3  
-                                      aten::convolution         1.12%       9.840us        15.15%     133.173us      44.391us       0.000us         0.00%      47.359us      15.786us             3  
-                                     aten::_convolution         2.65%      23.331us        14.03%     123.333us      41.111us       0.000us         0.00%      47.359us      15.786us             3  
-                                aten::_conv_depthwise2d         2.63%      23.161us         9.53%      83.782us      27.927us      47.359us        67.98%      47.359us      15.786us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.359us        67.98%      47.359us      15.786us             3  
-                                               aten::to         0.72%       6.308us        59.85%     526.239us      87.707us       0.000us         0.00%      26.336us       4.389us             6  
-                                         aten::_to_copy         2.80%      24.578us        59.14%     519.931us      86.655us       0.000us         0.00%      26.336us       4.389us             6  
-                                            aten::copy_         6.12%      53.792us        52.84%     464.590us      77.432us      22.304us        32.02%      26.336us       4.389us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.840us        17.00%      11.840us       3.947us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us        15.02%      10.464us       3.488us             3  
-                                Activity Buffer Request        26.53%     233.244us        26.53%     233.244us     233.244us       4.032us         5.79%       4.032us       4.032us             1  
-                                    aten::empty_strided         3.50%      30.763us         3.50%      30.763us       5.127us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        22.92%     201.494us        22.92%     201.494us      22.388us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.03%      17.891us         2.67%      23.440us       2.604us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.06%       9.339us         1.06%       9.339us       0.623us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         2.95%      25.971us         2.95%      25.971us       8.657us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.22%      10.710us         1.22%      10.710us       3.570us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.71%       6.240us         0.88%       7.780us       2.593us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.218us       487.48%     340.218us     340.218us             1  
+                                            torch_eager        15.18%     124.853us        99.38%     817.682us     817.682us       0.000us         0.00%      73.887us      73.887us             1  
+                                           aten::conv1d         0.72%       5.910us        14.57%     119.903us      39.968us       0.000us         0.00%      47.328us      15.776us             3  
+                                      aten::convolution         1.21%       9.960us        13.86%     113.993us      37.998us       0.000us         0.00%      47.328us      15.776us             3  
+                                     aten::_convolution         2.81%      23.101us        12.64%     104.033us      34.678us       0.000us         0.00%      47.328us      15.776us             3  
+                                aten::_conv_depthwise2d         2.62%      21.561us         7.83%      64.432us      21.477us      47.328us        67.81%      47.328us      15.776us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.328us        67.81%      47.328us      15.776us             3  
+                                               aten::to         0.75%       6.180us        66.30%     545.475us      90.913us       0.000us         0.00%      26.559us       4.426us             6  
+                                         aten::_to_copy         2.97%      24.459us        65.55%     539.295us      89.882us       0.000us         0.00%      26.559us       4.426us             6  
+                                            aten::copy_         6.14%      50.491us        58.93%     484.862us      80.810us      22.463us        32.19%      26.559us       4.426us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      12.032us        17.24%      12.032us       4.011us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us        14.95%      10.431us       3.477us             3  
+                                Activity Buffer Request        30.21%     248.576us        30.21%     248.576us     248.576us       4.096us         5.87%       4.096us       4.096us             1  
+                                    aten::empty_strided         3.64%      29.974us         3.64%      29.974us       4.996us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.32%     208.345us        25.32%     208.345us      23.149us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.09%      17.201us         2.72%      22.401us       2.489us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.11%       9.120us         1.11%       9.120us       0.608us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.32%      10.899us         1.32%      10.899us       3.633us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.15%       9.422us         1.15%       9.422us       3.141us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.80%       6.580us         0.98%       8.070us       2.690us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 879.215us
-Self CUDA time total: 69.663us
+Self CPU time total: 822.752us
+Self CUDA time total: 69.791us
 
 
 
@@ -4857,29 +4857,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     365.250us       197.10%     365.250us     365.250us             1  
-                                            torch_eager        14.70%     119.032us        99.37%     804.604us     804.604us       0.000us         0.00%     195.299us     195.299us             1  
-                                           aten::conv1d         0.95%       7.700us        17.22%     139.393us      46.464us       0.000us         0.00%     133.056us      44.352us             3  
-                                      aten::convolution         1.24%      10.040us        16.26%     131.693us      43.898us       0.000us         0.00%     133.056us      44.352us             3  
-                                     aten::_convolution         2.91%      23.550us        15.02%     121.653us      40.551us       0.000us         0.00%     133.056us      44.352us             3  
-                                aten::_conv_depthwise2d         2.69%      21.763us        10.08%      81.613us      27.204us     133.056us        71.80%     133.056us      44.352us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.056us        71.80%     133.056us      44.352us             3  
-                                               aten::to         0.75%       6.042us        64.10%     518.999us      86.500us       0.000us         0.00%      62.243us      10.374us             6  
-                                         aten::_to_copy         2.90%      23.470us        63.35%     512.957us      85.493us       0.000us         0.00%      62.243us      10.374us             6  
-                                            aten::copy_         6.35%      51.412us        56.59%     458.237us      76.373us      52.258us        28.20%      62.243us      10.374us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.250us        15.78%      29.250us       9.750us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.008us        12.42%      23.008us       7.669us             3  
-                                Activity Buffer Request        28.43%     230.213us        28.43%     230.213us     230.213us       9.985us         5.39%       9.985us       9.985us             1  
-                                    aten::empty_strided         3.86%      31.250us         3.86%      31.250us       5.208us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.81%     217.052us        26.81%     217.052us      24.117us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.10%      17.030us         2.74%      22.170us       2.463us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.13%       9.170us         1.13%       9.170us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.22%       9.870us         1.22%       9.870us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.18%       9.540us         1.18%       9.540us       3.180us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.78%       6.320us         1.00%       8.100us       2.700us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.276us       192.10%     357.276us     357.276us             1  
+                                            torch_eager         7.25%     148.445us        99.75%       2.043ms       2.043ms       0.000us         0.00%     196.063us     196.063us             1  
+                                           aten::conv1d         0.28%       5.714us         6.04%     123.725us      41.242us       0.000us         0.00%     133.535us      44.512us             3  
+                                      aten::convolution         0.50%      10.209us         5.76%     118.011us      39.337us       0.000us         0.00%     133.535us      44.512us             3  
+                                     aten::_convolution         1.22%      24.922us         5.26%     107.802us      35.934us       0.000us         0.00%     133.535us      44.512us             3  
+                                aten::_conv_depthwise2d         1.06%      21.740us         3.25%      66.540us      22.180us     133.535us        71.80%     133.535us      44.512us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.535us        71.80%     133.535us      44.512us             3  
+                                               aten::to         0.32%       6.558us        85.01%       1.741ms     290.215us       0.000us         0.00%      62.528us      10.421us             6  
+                                         aten::_to_copy         1.28%      26.242us        84.69%       1.735ms     289.122us       0.000us         0.00%      62.528us      10.421us             6  
+                                            aten::copy_         2.37%      48.539us        81.91%       1.678ms     279.634us      52.448us        28.20%      62.528us      10.421us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.536us        15.88%      29.536us       9.845us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        12.32%      22.912us       7.637us             3  
+                                Activity Buffer Request        70.45%       1.443ms        70.45%       1.443ms       1.443ms      10.080us         5.42%      10.080us      10.080us             1  
+                                    aten::empty_strided         1.50%      30.691us         1.50%      30.691us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.22%     209.265us        10.22%     209.265us      23.252us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.93%      19.072us         1.20%      24.640us       2.738us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.247us         0.45%       9.247us       0.616us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      11.270us         0.55%      11.270us       3.757us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%      10.520us         0.51%      10.520us       3.507us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.29%       5.931us         0.35%       7.230us       2.410us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 809.694us
-Self CUDA time total: 185.314us
+Self CPU time total: 2.048ms
+Self CUDA time total: 185.983us
 
 
 
@@ -4889,29 +4889,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     352.824us       168.80%     352.824us     352.824us             1  
-                                            torch_eager        14.40%     121.160us        99.40%     836.424us     836.424us       0.000us         0.00%     222.266us     222.266us             1  
-                                           aten::conv1d         0.71%       5.981us        14.17%     119.243us      39.748us       0.000us         0.00%     153.724us      51.241us             3  
-                                      aten::convolution         1.17%       9.810us        13.46%     113.262us      37.754us       0.000us         0.00%     153.724us      51.241us             3  
-                                     aten::_convolution         2.76%      23.250us        12.29%     103.452us      34.484us       0.000us         0.00%     153.724us      51.241us             3  
-                                aten::_conv_depthwise2d         2.65%      22.340us         7.64%      64.321us      21.440us     153.724us        73.55%     153.724us      51.241us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     153.724us        73.55%     153.724us      51.241us             3  
-                                               aten::to         0.70%       5.880us        67.58%     568.691us      94.782us       0.000us         0.00%      68.542us      11.424us             6  
-                                         aten::_to_copy         2.81%      23.631us        66.88%     562.811us      93.802us       0.000us         0.00%      68.542us      11.424us             6  
-                                            aten::copy_         7.48%      62.921us        60.21%     506.640us      84.440us      55.294us        26.45%      68.542us      11.424us             6  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.352us        15.48%      32.352us      10.784us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.942us        10.98%      22.942us       7.647us             3  
-                                Activity Buffer Request        31.88%     268.245us        31.88%     268.245us     268.245us      13.248us         6.34%      13.248us      13.248us             1  
-                                    aten::empty_strided         3.87%      32.540us         3.87%      32.540us       5.423us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        23.51%     197.824us        23.51%     197.824us      21.980us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.07%      17.378us         2.68%      22.521us       2.502us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.06%       8.883us         1.06%       8.883us       0.592us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.19%       9.991us         1.19%       9.991us       3.330us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.15%       9.640us         1.15%       9.640us       3.213us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.71%       5.990us         0.89%       7.470us       2.490us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     358.235us       170.21%     358.235us     358.235us             1  
+                                            torch_eager        15.50%     124.275us        99.34%     796.461us     796.461us       0.000us         0.00%     224.253us     224.253us             1  
+                                           aten::conv1d         0.70%       5.590us        14.78%     118.483us      39.494us       0.000us         0.00%     154.174us      51.391us             3  
+                                      aten::convolution         1.24%       9.921us        14.08%     112.893us      37.631us       0.000us         0.00%     154.174us      51.391us             3  
+                                     aten::_convolution         2.81%      22.549us        12.84%     102.972us      34.324us       0.000us         0.00%     154.174us      51.391us             3  
+                                aten::_conv_depthwise2d         2.82%      22.632us         8.11%      65.062us      21.687us     154.174us        73.26%     154.174us      51.391us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     154.174us        73.26%     154.174us      51.391us             3  
+                                               aten::to         0.74%       5.971us        65.46%     524.833us      87.472us       0.000us         0.00%      70.079us      11.680us             6  
+                                         aten::_to_copy         3.23%      25.880us        64.72%     518.862us      86.477us       0.000us         0.00%      70.079us      11.680us             6  
+                                            aten::copy_         6.33%      50.713us        57.67%     462.401us      77.067us      56.287us        26.74%      70.079us      11.680us             6  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      33.248us        15.80%      33.248us      11.083us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.039us        10.95%      23.039us       7.680us             3  
+                                Activity Buffer Request        28.19%     225.995us        28.19%     225.995us     225.995us      13.792us         6.55%      13.792us      13.792us             1  
+                                    aten::empty_strided         3.81%      30.581us         3.81%      30.581us       5.097us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        25.98%     208.263us        25.98%     208.263us      23.140us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.24%      17.992us         2.91%      23.301us       2.589us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.16%       9.309us         1.16%       9.309us       0.621us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.31%      10.480us         1.31%      10.480us       3.493us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.17%       9.380us         1.17%       9.380us       3.127us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.74%       5.910us         0.92%       7.370us       2.457us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 841.495us
-Self CUDA time total: 209.018us
+Self CPU time total: 801.751us
+Self CUDA time total: 210.461us
 
 
 
@@ -4921,29 +4921,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.78%     125.712us        53.74%     996.387us     996.387us       0.000us         0.00%       1.527ms       1.527ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.423ms       100.39%       1.423ms       1.423ms             1  
-                                               aten::to         0.35%       6.438us        38.84%     720.182us     120.030us       0.000us         0.00%     832.992us     138.832us             6  
-                                         aten::_to_copy         1.55%      28.691us        38.49%     713.744us     118.957us       0.000us         0.00%     832.992us     138.832us             6  
-                                            aten::copy_         2.90%      53.742us        26.33%     488.279us      81.380us     724.000us        51.06%     832.992us     138.832us             6  
-                                           aten::conv1d         0.38%       6.960us         6.55%     121.533us      40.511us       0.000us         0.00%     693.950us     231.317us             3  
-                                      aten::convolution         0.56%      10.430us         6.18%     114.573us      38.191us       0.000us         0.00%     693.950us     231.317us             3  
-                                     aten::_convolution         1.25%      23.268us         5.62%     104.143us      34.714us       0.000us         0.00%     693.950us     231.317us             3  
-                                aten::_conv_depthwise2d         1.23%      22.830us         3.48%      64.552us      21.517us     693.950us        48.94%     693.950us     231.317us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     693.950us        48.94%     693.950us     231.317us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     410.655us        28.96%     410.655us     136.885us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     313.345us        22.10%     313.345us     104.448us             3  
-                                Activity Buffer Request        13.73%     254.654us        13.73%     254.654us     254.654us     108.992us         7.69%     108.992us     108.992us             1  
-                                    aten::empty_strided         2.01%      37.271us        10.61%     196.774us      32.796us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.89%     201.884us        10.89%     201.884us      22.432us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.98%      18.223us         1.29%      23.933us       2.659us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.51%       9.490us         0.51%       9.490us       0.633us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.54%      10.101us         0.54%      10.101us       3.367us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.52%       9.620us         0.52%       9.620us       3.207us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.270us         0.41%       7.680us       2.560us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         7.15%     131.473us        52.77%     970.085us     970.085us       0.000us         0.00%       1.521ms       1.521ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.421ms       100.40%       1.421ms       1.421ms             1  
+                                               aten::to         0.36%       6.571us        37.17%     683.219us     113.870us       0.000us         0.00%     824.180us     137.363us             6  
+                                         aten::_to_copy         1.61%      29.612us        36.81%     676.648us     112.775us       0.000us         0.00%     824.180us     137.363us             6  
+                                            aten::copy_         2.81%      51.569us        25.14%     462.051us      77.009us     718.613us        50.76%     824.180us     137.363us             6  
+                                           aten::conv1d         0.36%       6.680us         6.82%     125.423us      41.808us       0.000us         0.00%     696.981us     232.327us             3  
+                                      aten::convolution         0.57%      10.460us         6.46%     118.743us      39.581us       0.000us         0.00%     696.981us     232.327us             3  
+                                     aten::_convolution         1.31%      24.040us         5.89%     108.283us      36.094us       0.000us         0.00%     696.981us     232.327us             3  
+                                aten::_conv_depthwise2d         1.25%      22.981us         3.69%      67.913us      22.638us     696.981us        49.24%     696.981us     232.327us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     696.981us        49.24%     696.981us     232.327us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     410.458us        29.00%     410.458us     136.819us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     308.155us        21.77%     308.155us     102.718us             3  
+                                Activity Buffer Request        11.91%     218.936us        11.91%     218.936us     218.936us     105.567us         7.46%     105.567us     105.567us             1  
+                                    aten::empty_strided         2.01%      37.011us        10.06%     184.985us      30.831us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.74%     215.777us        11.74%     215.777us      23.975us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.99%      18.200us         1.31%      24.000us       2.667us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.53%       9.740us         0.53%       9.740us       0.649us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.59%      10.839us         0.59%      10.839us       3.613us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.54%       9.862us         0.54%       9.862us       3.287us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       6.240us         0.42%       7.700us       2.567us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.854ms
-Self CUDA time total: 1.418ms
+Self CPU time total: 1.838ms
+Self CUDA time total: 1.416ms
 
 
 
@@ -4953,109 +4953,57 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         4.03%     122.972us        65.43%       1.999ms       1.999ms       0.000us         0.00%       1.502ms       1.502ms             1  
+                                            torch_eager         6.74%     124.615us        43.66%     806.720us     806.720us       0.000us         0.00%       1.502ms       1.502ms             1  
                                             torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.433ms       100.41%       1.433ms       1.433ms             1  
-                                               aten::to         0.19%       5.740us        56.63%       1.730ms     288.331us       0.000us         0.00%     766.432us     127.739us             6  
-                                         aten::_to_copy         0.79%      24.119us        56.45%       1.724ms     287.375us       0.000us         0.00%     766.432us     127.739us             6  
-                                            aten::copy_         1.70%      52.020us        54.70%       1.671ms     278.493us     691.168us        48.43%     766.432us     127.739us             6  
-                                           aten::conv1d         0.23%       6.891us         3.86%     118.002us      39.334us       0.000us         0.00%     736.031us     245.344us             3  
-                                      aten::convolution         0.33%       9.930us         3.64%     111.111us      37.037us       0.000us         0.00%     736.031us     245.344us             3  
-                                     aten::_convolution         0.74%      22.558us         3.31%     101.181us      33.727us       0.000us         0.00%     736.031us     245.344us             3  
-                                aten::_conv_depthwise2d         0.70%      21.291us         2.07%      63.232us      21.077us     736.031us        51.57%     736.031us     245.344us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     736.031us        51.57%     736.031us     245.344us             3  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     401.120us        28.11%     401.120us     133.707us             3  
-void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     290.048us        20.32%     290.048us      96.683us             3  
-                                Activity Buffer Request        47.17%       1.441ms        47.17%       1.441ms       1.441ms      75.264us         5.27%      75.264us      75.264us             1  
-                                    aten::empty_strided         0.95%      29.171us         0.95%      29.171us       4.862us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         6.58%     201.084us         6.58%     201.084us      22.343us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.57%      17.550us         0.75%      22.971us       2.552us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.30%       9.131us         0.30%       9.131us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.33%       9.960us         0.33%       9.960us       3.320us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.30%       9.060us         0.30%       9.060us       3.020us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.18%       5.561us         0.23%       7.041us       2.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                               aten::to         0.34%       6.269us        28.35%     523.751us      87.292us       0.000us         0.00%     764.786us     127.464us             6  
+                                         aten::_to_copy         1.27%      23.480us        28.01%     517.482us      86.247us       0.000us         0.00%     764.786us     127.464us             6  
+                                            aten::copy_         2.74%      50.661us        25.15%     464.712us      77.452us     690.099us        48.36%     764.786us     127.464us             6  
+                                           aten::conv1d         0.32%       5.870us         7.00%     129.374us      43.125us       0.000us         0.00%     737.040us     245.680us             3  
+                                      aten::convolution         0.54%       9.999us         6.68%     123.504us      41.168us       0.000us         0.00%     737.040us     245.680us             3  
+                                     aten::_convolution         1.31%      24.293us         6.14%     113.505us      37.835us       0.000us         0.00%     737.040us     245.680us             3  
+                                aten::_conv_depthwise2d         1.62%      30.010us         3.95%      73.060us      24.353us     737.040us        51.64%     737.040us     245.680us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     737.040us        51.64%     737.040us     245.680us             3  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     399.673us        28.01%     399.673us     133.224us             3  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     290.426us        20.35%     290.426us      96.809us             3  
+                                Activity Buffer Request        12.15%     224.466us        12.15%     224.466us     224.466us      74.687us         5.23%      74.687us      74.687us             1  
+                                    aten::empty_strided         1.59%      29.290us         1.59%      29.290us       4.882us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.52%     212.785us        11.52%     212.785us      23.643us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.94%      17.281us         1.23%      22.771us       2.530us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.55%      10.081us         0.55%      10.081us       0.672us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.57%      10.440us         0.57%      10.440us       3.480us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%       9.410us         0.51%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.33%       6.150us         0.41%       7.641us       2.547us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.055ms
+Self CPU time total: 1.848ms
 Self CUDA time total: 1.427ms
 
 
 impl                     wl                  p50(ms)  ok
 torch_eager              cuda_B2_D2048_S128_W2     0.09  True
 torch_eager              cuda_B2_D2048_S128_W4     0.08  True
-torch_eager              cuda_B2_D2048_S2048_W2     0.14  True
+torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
 torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
 torch_eager              cuda_B2_D2048_S512_W2     0.09  True
-torch_eager              cuda_B2_D2048_S512_W4     0.08  True
+torch_eager              cuda_B2_D2048_S512_W4     0.09  True
 torch_eager              cuda_B2_D64_S128_W2     0.07  True
 torch_eager              cuda_B2_D64_S128_W4     0.09  True
 torch_eager              cuda_B2_D64_S2048_W2     0.09  True
-torch_eager              cuda_B2_D64_S2048_W4     0.08  True
+torch_eager              cuda_B2_D64_S2048_W4     0.09  True
 torch_eager              cuda_B2_D64_S512_W2     0.09  True
 torch_eager              cuda_B2_D64_S512_W4     0.09  True
-torch_eager              cuda_B4_D2048_S128_W2     0.08  True
-torch_eager              cuda_B4_D2048_S128_W4     0.08  True
+torch_eager              cuda_B4_D2048_S128_W2     0.09  True
+torch_eager              cuda_B4_D2048_S128_W4     0.09  True
 torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
 torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
-torch_eager              cuda_B4_D2048_S512_W2     0.09  True
+torch_eager              cuda_B4_D2048_S512_W2     0.10  True
 torch_eager              cuda_B4_D2048_S512_W4     0.10  True
-torch_eager              cuda_B4_D64_S128_W2     0.08  True
+torch_eager              cuda_B4_D64_S128_W2     0.09  True
 torch_eager              cuda_B4_D64_S128_W4     0.08  True
-torch_eager              cuda_B4_D64_S2048_W2     0.08  True
-torch_eager              cuda_B4_D64_S2048_W4     0.08  True
-torch_eager              cuda_B4_D64_S512_W2     0.08  True
-torch_eager              cuda_B4_D64_S512_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W2     0.09  True
+torch_eager              cuda_B4_D64_S2048_W4     0.09  True
+torch_eager              cuda_B4_D64_S512_W2     0.09  True
+torch_eager              cuda_B4_D64_S512_W4     0.09  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-   Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
-Downloading pillow (6.7MiB)
-Downloading fonttools (4.7MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading networkx (1.9MiB)
-Downloading numpy (16.2MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading torch (846.9MiB)
-Downloading triton (148.3MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
-      Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 218ms
-</div>
-</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/causal_conv1d.jsonl" class="artifact" target="_blank">causal_conv1d.jsonl</a>
diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg
index 9b058d2666ce3f17f1e0271794e89c52b55a50d5..1051764b171c27ddd8f8651b286d107eb666bd69 100644
--- a/causal_conv1d/results/artifacts/combine/latency.svg
+++ b/causal_conv1d/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf8858bb054bd7e8f82af77fd05a6475b7ee3a9a335ba4a6506cd1c694804777
+oid sha256:6fdf61512b0add92f3d8e4a284ecb814f7a3b11b2db0fe3af610896a05d7072f
 size 35426
diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html
index 478077209c7e2fef5044dc68f9a6ef240e0167c9..6a99b42f98995858e618176be6ad4beb1b59c2c4 100644
--- a/causal_conv1d/results/combined_results.html
+++ b/causal_conv1d/results/combined_results.html
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:58.349427</dc:date>
+    <dc:date>2025-10-31T20:14:05.716143</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4451,70 +4451,70 @@ body[data-tool="eraser"] .main-content {
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 375.197972  L 831.034248 375.197972  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 375.317309  L 831.034248 375.317309  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_25">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="375.197972" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="375.317309" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_25">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="378.997191" transform="rotate(-0 40.72 378.997191)">0.1</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.116528" transform="rotate(-0 40.72 379.116528)">0.1</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 292.404953  L 831.034248 292.404953  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 292.576412  L 831.034248 292.576412  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_26">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="292.404953" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="292.576412" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_26">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.204172" transform="rotate(-0 40.72 296.204172)">0.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.375631" transform="rotate(-0 40.72 296.375631)">0.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 209.611934  L 831.034248 209.611934  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 209.835514  L 831.034248 209.835514  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_27">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="209.611934" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="209.835514" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_27">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.411153" transform="rotate(-0 40.72 213.411153)">0.3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.634733" transform="rotate(-0 40.72 213.634733)">0.3</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 126.818915  L 831.034248 126.818915  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 127.094617  L 831.034248 127.094617  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_28">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="126.818915" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="127.094617" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_28">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.618134" transform="rotate(-0 40.72 130.618134)">0.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.893835" transform="rotate(-0 40.72 130.893835)">0.4</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 44.025896  L 831.034248 44.025896  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 44.353719  L 831.034248 44.353719  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_29">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="44.025896" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="44.353719" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_29">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.825115" transform="rotate(-0 40.72 47.825115)">0.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="48.152938" transform="rotate(-0 40.72 48.152938)">0.5</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4522,66 +4522,66 @@ body[data-tool="eraser"] .main-content {
     </g>
    </g>
    <g id="series--hf-kernels-causal-conv1d" class="series">
-    <path d="M 83.325193 420.186871  L 114.286231 412.876247  L 145.247268 415.070262  L 176.208306 415.078541  L 207.169343 415.235848  L 238.130381 416.63505  L 269.091418 416.444626  L 300.052455 415.56702  L 331.013493 415.889913  L 361.97453 415.989265  L 392.935568 415.633255  L 423.896605 415.119938  L 454.857643 414.912955  L 485.81868 415.285524  L 516.779718 414.773035  L 547.740755 413.704177  L 578.701793 415.831958  L 609.66283 415.989265  L 640.623868 416.03894  L 671.584905 416.204526  L 702.545943 414.879838  L 733.50698 415.351758  L 764.468018 416.295599  L 795.429055 414.97919  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.325193 420.186871  L 114.286231 415.032113  L 145.247268 414.700322  L 176.208306 414.808712  L 207.169343 415.503736  L 238.130381 416.677829  L 269.091418 416.322043  L 300.052455 416.2815  L 331.013493 416.364241  L 361.97453 415.247239  L 392.935568 416.78622  L 423.896605 416.686103  L 454.857643 416.223582  L 485.81868 417.390228  L 516.779718 415.991907  L 547.740755 415.279508  L 578.701793 415.702314  L 609.66283 416.082095  L 640.623868 416.173937  L 671.584905 415.884344  L 702.545943 416.157389  L 733.50698 416.115191  L 764.468018 416.686103  L 795.429055 415.967085  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#pb49fc4c8d2)">
      <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="114.286231" y="412.876247" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="145.247268" y="415.070262" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="176.208306" y="415.078541" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="207.169343" y="415.235848" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="238.130381" y="416.63505" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="269.091418" y="416.444626" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="300.052455" y="415.56702" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="331.013493" y="415.889913" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="361.97453" y="415.989265" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="392.935568" y="415.633255" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="423.896605" y="415.119938" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="454.857643" y="414.912955" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="485.81868" y="415.285524" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="516.779718" y="414.773035" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="547.740755" y="413.704177" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="578.701793" y="415.831958" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="609.66283" y="415.989265" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="640.623868" y="416.03894" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="671.584905" y="416.204526" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="702.545943" y="414.879838" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="733.50698" y="415.351758" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="764.468018" y="416.295599" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="795.429055" y="414.97919" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="114.286231" y="415.032113" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="145.247268" y="414.700322" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="176.208306" y="414.808712" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="207.169343" y="415.503736" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="238.130381" y="416.677829" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="269.091418" y="416.322043" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="300.052455" y="416.2815" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="331.013493" y="416.364241" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="361.97453" y="415.247239" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="392.935568" y="416.78622" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="423.896605" y="416.686103" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="454.857643" y="416.223582" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="485.81868" y="417.390228" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="516.779718" y="415.991907" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="547.740755" y="415.279508" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="578.701793" y="415.702314" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="609.66283" y="416.082095" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="640.623868" y="416.173937" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="671.584905" y="415.884344" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="702.545943" y="416.157389" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="733.50698" y="416.115191" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="764.468018" y="416.686103" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="795.429055" y="415.967085" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 83.325193 398.743479  L 114.286231 386.307139  L 145.247268 385.761533  L 176.208306 386.431329  L 207.169343 387.118511  L 238.130381 389.975698  L 269.091418 385.612506  L 300.052455 387.674052  L 331.013493 387.276646  L 361.97453 388.402631  L 392.935568 338.137333  L 423.896605 324.393692  L 454.857643 388.923399  L 485.81868 389.229733  L 516.779718 389.570012  L 547.740755 388.501982  L 578.701793 388.517713  L 609.66283 388.55083  L 640.623868 388.518541  L 671.584905 388.609613  L 702.545943 379.783049  L 733.50698 374.2028  L 764.468018 54.691293  L 795.429055 45.608899  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.325193 399.368433  L 114.286231 384.590909  L 145.247268 385.069979  L 176.208306 385.98923  L 207.169343 386.154712  L 238.130381 385.815474  L 269.091418 387.619226  L 300.052455 388.023829  L 331.013493 386.783543  L 361.97453 386.047149  L 392.935568 337.468313  L 423.896605 323.758146  L 454.857643 387.577855  L 485.81868 388.03293  L 516.779718 387.569581  L 547.740755 387.180699  L 578.701793 387.519109  L 609.66283 386.12989  L 640.623868 386.808365  L 671.584905 386.419483  L 702.545943 379.13001  L 733.50698 373.727029  L 764.468018 53.453563  L 795.429055 45.608899  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#pb49fc4c8d2)">
-     <use ns4:href="#m9b8c54d372" x="83.325193" y="398.743479" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="114.286231" y="386.307139" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="145.247268" y="385.761533" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="176.208306" y="386.431329" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="207.169343" y="387.118511" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="238.130381" y="389.975698" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="269.091418" y="385.612506" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="300.052455" y="387.674052" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="331.013493" y="387.276646" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="361.97453" y="388.402631" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="392.935568" y="338.137333" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="423.896605" y="324.393692" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="454.857643" y="388.923399" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="485.81868" y="389.229733" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="516.779718" y="389.570012" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="547.740755" y="388.501982" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="578.701793" y="388.517713" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="609.66283" y="388.55083" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="640.623868" y="388.518541" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="671.584905" y="388.609613" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="702.545943" y="379.783049" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="733.50698" y="374.2028" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="764.468018" y="54.691293" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.325193" y="399.368433" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="114.286231" y="384.590909" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="145.247268" y="385.069979" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="176.208306" y="385.98923" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="207.169343" y="386.154712" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="238.130381" y="385.815474" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="269.091418" y="387.619226" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="300.052455" y="388.023829" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="331.013493" y="386.783543" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="361.97453" y="386.047149" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="392.935568" y="337.468313" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="423.896605" y="323.758146" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="454.857643" y="387.577855" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="485.81868" y="388.03293" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="516.779718" y="387.569581" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="547.740755" y="387.180699" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="578.701793" y="387.519109" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="609.66283" y="386.12989" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="640.623868" y="386.808365" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="671.584905" y="386.419483" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="702.545943" y="379.13001" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="733.50698" y="373.727029" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="764.468018" y="53.453563" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
@@ -4640,7 +4640,7 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 4.38s
+Cell: combine | 4.43s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4753,28 +4753,28 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.05  True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
 torch_eager              cuda_B2_D2048_S128_W2     0.09  True
 torch_eager              cuda_B2_D2048_S128_W4     0.08  True
-torch_eager              cuda_B2_D2048_S2048_W2     0.14  True
+torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
 torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
 torch_eager              cuda_B2_D2048_S512_W2     0.09  True
-torch_eager              cuda_B2_D2048_S512_W4     0.08  True
+torch_eager              cuda_B2_D2048_S512_W4     0.09  True
 torch_eager              cuda_B2_D64_S128_W2     0.07  True
 torch_eager              cuda_B2_D64_S128_W4     0.09  True
 torch_eager              cuda_B2_D64_S2048_W2     0.09  True
-torch_eager              cuda_B2_D64_S2048_W4     0.08  True
+torch_eager              cuda_B2_D64_S2048_W4     0.09  True
 torch_eager              cuda_B2_D64_S512_W2     0.09  True
 torch_eager              cuda_B2_D64_S512_W4     0.09  True
-torch_eager              cuda_B4_D2048_S128_W2     0.08  True
-torch_eager              cuda_B4_D2048_S128_W4     0.08  True
+torch_eager              cuda_B4_D2048_S128_W2     0.09  True
+torch_eager              cuda_B4_D2048_S128_W4     0.09  True
 torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
 torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
-torch_eager              cuda_B4_D2048_S512_W2     0.09  True
+torch_eager              cuda_B4_D2048_S512_W2     0.10  True
 torch_eager              cuda_B4_D2048_S512_W4     0.10  True
-torch_eager              cuda_B4_D64_S128_W2     0.08  True
+torch_eager              cuda_B4_D64_S128_W2     0.09  True
 torch_eager              cuda_B4_D64_S128_W4     0.08  True
-torch_eager              cuda_B4_D64_S2048_W2     0.08  True
-torch_eager              cuda_B4_D64_S2048_W4     0.08  True
-torch_eager              cuda_B4_D64_S512_W2     0.08  True
-torch_eager              cuda_B4_D64_S512_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W2     0.09  True
+torch_eager              cuda_B4_D64_S2048_W4     0.09  True
+torch_eager              cuda_B4_D64_S512_W2     0.09  True
+torch_eager              cuda_B4_D64_S512_W4     0.09  True
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4794,7 +4794,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 211ms
+Installed 37 packages in 238ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4807,7 +4807,7 @@ Installed 37 packages in 211ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:58.349427</dc:date>
+    <dc:date>2025-10-31T20:14:05.716143</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -5151,70 +5151,70 @@ Installed 37 packages in 211ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 375.197972  L 831.034248 375.197972  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 375.317309  L 831.034248 375.317309  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_25">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="375.197972" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="375.317309" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_25">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="378.997191" transform="rotate(-0 40.72 378.997191)">0.1</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.116528" transform="rotate(-0 40.72 379.116528)">0.1</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 292.404953  L 831.034248 292.404953  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 292.576412  L 831.034248 292.576412  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_26">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="292.404953" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="292.576412" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_26">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.204172" transform="rotate(-0 40.72 296.204172)">0.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.375631" transform="rotate(-0 40.72 296.375631)">0.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 209.611934  L 831.034248 209.611934  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 209.835514  L 831.034248 209.835514  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_27">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="209.611934" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="209.835514" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_27">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.411153" transform="rotate(-0 40.72 213.411153)">0.3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.634733" transform="rotate(-0 40.72 213.634733)">0.3</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 126.818915  L 831.034248 126.818915  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 127.094617  L 831.034248 127.094617  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_28">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="126.818915" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="127.094617" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_28">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.618134" transform="rotate(-0 40.72 130.618134)">0.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="130.893835" transform="rotate(-0 40.72 130.893835)">0.4</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 44.025896  L 831.034248 44.025896  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 44.353719  L 831.034248 44.353719  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_29">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="44.025896" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="44.353719" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_29">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="47.825115" transform="rotate(-0 40.72 47.825115)">0.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="48.152938" transform="rotate(-0 40.72 48.152938)">0.5</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -5222,66 +5222,66 @@ Installed 37 packages in 211ms
     </g>
    </g>
    <g id="series--hf-kernels-causal-conv1d" class="series">
-    <path d="M 83.325193 420.186871  L 114.286231 412.876247  L 145.247268 415.070262  L 176.208306 415.078541  L 207.169343 415.235848  L 238.130381 416.63505  L 269.091418 416.444626  L 300.052455 415.56702  L 331.013493 415.889913  L 361.97453 415.989265  L 392.935568 415.633255  L 423.896605 415.119938  L 454.857643 414.912955  L 485.81868 415.285524  L 516.779718 414.773035  L 547.740755 413.704177  L 578.701793 415.831958  L 609.66283 415.989265  L 640.623868 416.03894  L 671.584905 416.204526  L 702.545943 414.879838  L 733.50698 415.351758  L 764.468018 416.295599  L 795.429055 414.97919  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.325193 420.186871  L 114.286231 415.032113  L 145.247268 414.700322  L 176.208306 414.808712  L 207.169343 415.503736  L 238.130381 416.677829  L 269.091418 416.322043  L 300.052455 416.2815  L 331.013493 416.364241  L 361.97453 415.247239  L 392.935568 416.78622  L 423.896605 416.686103  L 454.857643 416.223582  L 485.81868 417.390228  L 516.779718 415.991907  L 547.740755 415.279508  L 578.701793 415.702314  L 609.66283 416.082095  L 640.623868 416.173937  L 671.584905 415.884344  L 702.545943 416.157389  L 733.50698 416.115191  L 764.468018 416.686103  L 795.429055 415.967085  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#pb49fc4c8d2)">
      <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="114.286231" y="412.876247" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="145.247268" y="415.070262" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="176.208306" y="415.078541" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="207.169343" y="415.235848" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="238.130381" y="416.63505" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="269.091418" y="416.444626" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="300.052455" y="415.56702" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="331.013493" y="415.889913" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="361.97453" y="415.989265" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="392.935568" y="415.633255" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="423.896605" y="415.119938" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="454.857643" y="414.912955" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="485.81868" y="415.285524" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="516.779718" y="414.773035" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="547.740755" y="413.704177" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="578.701793" y="415.831958" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="609.66283" y="415.989265" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="640.623868" y="416.03894" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="671.584905" y="416.204526" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="702.545943" y="414.879838" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="733.50698" y="415.351758" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="764.468018" y="416.295599" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="795.429055" y="414.97919" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="114.286231" y="415.032113" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="145.247268" y="414.700322" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="176.208306" y="414.808712" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="207.169343" y="415.503736" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="238.130381" y="416.677829" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="269.091418" y="416.322043" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="300.052455" y="416.2815" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="331.013493" y="416.364241" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="361.97453" y="415.247239" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="392.935568" y="416.78622" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="423.896605" y="416.686103" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="454.857643" y="416.223582" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="485.81868" y="417.390228" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="516.779718" y="415.991907" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="547.740755" y="415.279508" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="578.701793" y="415.702314" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="609.66283" y="416.082095" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="640.623868" y="416.173937" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="671.584905" y="415.884344" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="702.545943" y="416.157389" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="733.50698" y="416.115191" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="764.468018" y="416.686103" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="795.429055" y="415.967085" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 83.325193 398.743479  L 114.286231 386.307139  L 145.247268 385.761533  L 176.208306 386.431329  L 207.169343 387.118511  L 238.130381 389.975698  L 269.091418 385.612506  L 300.052455 387.674052  L 331.013493 387.276646  L 361.97453 388.402631  L 392.935568 338.137333  L 423.896605 324.393692  L 454.857643 388.923399  L 485.81868 389.229733  L 516.779718 389.570012  L 547.740755 388.501982  L 578.701793 388.517713  L 609.66283 388.55083  L 640.623868 388.518541  L 671.584905 388.609613  L 702.545943 379.783049  L 733.50698 374.2028  L 764.468018 54.691293  L 795.429055 45.608899  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.325193 399.368433  L 114.286231 384.590909  L 145.247268 385.069979  L 176.208306 385.98923  L 207.169343 386.154712  L 238.130381 385.815474  L 269.091418 387.619226  L 300.052455 388.023829  L 331.013493 386.783543  L 361.97453 386.047149  L 392.935568 337.468313  L 423.896605 323.758146  L 454.857643 387.577855  L 485.81868 388.03293  L 516.779718 387.569581  L 547.740755 387.180699  L 578.701793 387.519109  L 609.66283 386.12989  L 640.623868 386.808365  L 671.584905 386.419483  L 702.545943 379.13001  L 733.50698 373.727029  L 764.468018 53.453563  L 795.429055 45.608899  " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#pb49fc4c8d2)">
-     <use ns4:href="#m9b8c54d372" x="83.325193" y="398.743479" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="114.286231" y="386.307139" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="145.247268" y="385.761533" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="176.208306" y="386.431329" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="207.169343" y="387.118511" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="238.130381" y="389.975698" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="269.091418" y="385.612506" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="300.052455" y="387.674052" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="331.013493" y="387.276646" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="361.97453" y="388.402631" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="392.935568" y="338.137333" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="423.896605" y="324.393692" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="454.857643" y="388.923399" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="485.81868" y="389.229733" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="516.779718" y="389.570012" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="547.740755" y="388.501982" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="578.701793" y="388.517713" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="609.66283" y="388.55083" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="640.623868" y="388.518541" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="671.584905" y="388.609613" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="702.545943" y="379.783049" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="733.50698" y="374.2028" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="764.468018" y="54.691293" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.325193" y="399.368433" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="114.286231" y="384.590909" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="145.247268" y="385.069979" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="176.208306" y="385.98923" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="207.169343" y="386.154712" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="238.130381" y="385.815474" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="269.091418" y="387.619226" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="300.052455" y="388.023829" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="331.013493" y="386.783543" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="361.97453" y="386.047149" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="392.935568" y="337.468313" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="423.896605" y="323.758146" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="454.857643" y="387.577855" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="485.81868" y="388.03293" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="516.779718" y="387.569581" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="547.740755" y="387.180699" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="578.701793" y="387.519109" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="609.66283" y="386.12989" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="640.623868" y="386.808365" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="671.584905" y="386.419483" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="702.545943" y="379.13001" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="733.50698" y="373.727029" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="764.468018" y="53.453563" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
diff --git a/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl b/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..52c7930d88f40dd4da2a4cc2aa3b8068bb350deb
--- /dev/null
+++ b/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl
@@ -0,0 +1,4 @@
+{"ts": "2025-10-31T20:13:50Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.3733269999629556, "p50": 3.3932979999917734, "p90": 3.4002180000243243, "mean": 3.393551400040451, "iqr": 0.010580999969533877, "raw_times": [3.3896370000547904, 3.4002180000243243, 3.3932979999917734, 3.3733269999629556, 3.411277000168411], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.4049870000671945, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.99112300010529, "p50": 4.007804000139004, "p90": 4.020502999992459, "mean": 4.014501400024528, "iqr": 0.017490000118414173, "raw_times": [4.050064000011844, 4.020502999992459, 4.007804000139004, 4.003012999874045, 3.99112300010529], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.017783999870517, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.004662999932407, "p50": 4.020202999981848, "p90": 4.030714000009539, "mean": 4.022331200030749, "iqr": 0.011850999953821884, "raw_times": [4.018863000055717, 4.004662999932407, 4.0372130001742335, 4.020202999981848, 4.030714000009539], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.032904000041526, "peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:52Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.005022999990615, "p50": 4.020072999992408, "p90": 4.0240040000298904, "mean": 4.01746140000796, "iqr": 0.009850999958871398, "raw_times": [4.014153000071019, 4.005022999990615, 4.024053999955868, 4.0240040000298904, 4.020072999992408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.024974000003567, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
diff --git a/deformable_detr/impls/cells/benchmark.py b/deformable_detr/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..66ccdf2085524240060089c8658a5256c484037b
--- /dev/null
+++ b/deformable_detr/impls/cells/benchmark.py
@@ -0,0 +1,118 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_deformable_detr(
+    value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
+):
+    """
+    PyTorch native reference implementation of multi-scale deformable attention.
+    Uses vectorized bilinear interpolation for reasonable performance.
+
+    Args:
+        value: (bs, sum of h*w over levels, num_heads, channels) flattened features.
+        spatial_shapes: tensor of per-level (h, w) rows; read on the host via .tolist().
+        level_start_index: accepted but unused; levels are recovered with value.split().
+        sampling_locations: (bs, num_queries, num_heads, num_levels, num_points, 2),
+            (x, y) in normalized coordinates (scaled by w and h below).
+        attention_weights: (bs, num_queries, num_heads, num_levels, num_points).
+        im2col_step: accepted but unused by this implementation.
+
+    Returns:
+        Tensor of shape (bs, num_queries, num_heads * channels), dtype/device of value.
+    """
+    bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+    _, _, _, channels = value.shape
+
+    output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype)
+
+    # Read the level shapes once; a per-level .tolist() would sync again if the tensor is on GPU
+    shapes = [(int(h), int(w)) for h, w in spatial_shapes.tolist()]
+    value_list = value.split([h * w for h, w in shapes], dim=1)
+
+    # Batch/head index grids for advanced indexing; level-independent, so built once
+    batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
+    head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
+
+    # Iterate through each level (can't avoid this loop easily)
+    for level_idx in range(num_levels):
+        h, w = shapes[level_idx]
+        # Reshape (bs, h*w, num_heads, channels) to a grid: (bs, num_heads, channels, h, w)
+        value_spatial = value_list[level_idx].reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
+
+        # Sampling locations (..., num_points, 2) and weights (..., num_points) for this level
+        loc = sampling_locations[:, :, :, level_idx, :, :]
+        weight = attention_weights[:, :, :, level_idx, :]
+
+        # Convert normalized coordinates to pixel coordinates (index 0 is x/width, 1 is y/height)
+        x = loc[..., 0] * w - 0.5  # (bs, num_queries, num_heads, num_points)
+        y = loc[..., 1] * h - 0.5
+
+        # Get integer coordinates for bilinear interpolation
+        x0 = torch.floor(x).long()
+        y0 = torch.floor(y).long()
+        x1 = x0 + 1
+        y1 = y0 + 1
+
+        # Compute interpolation weights BEFORE clamping (important!)
+        lw = x - x0.float()  # weight for x direction
+        lh = y - y0.float()  # weight for y direction
+        hw = 1 - lw
+        hh = 1 - lh
+
+        # Create mask for valid sample locations
+        valid = (y > -1) & (x > -1) & (y < h) & (x < w)
+
+        # Create masks for each corner being in bounds
+        mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
+        mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
+        mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
+        mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
+
+        # Clamp coordinates for safe indexing
+        x0_clamped = torch.clamp(x0, 0, w - 1)
+        x1_clamped = torch.clamp(x1, 0, w - 1)
+        y0_clamped = torch.clamp(y0, 0, h - 1)
+        y1_clamped = torch.clamp(y1, 0, h - 1)
+
+        # Bilinear interpolation weights for all 4 corners
+        w_tl = (hh * hw).unsqueeze(-1)  # top-left: (bs, num_queries, num_heads, num_points, 1)
+        w_tr = (hh * lw).unsqueeze(-1)  # top-right
+        w_bl = (lh * hw).unsqueeze(-1)  # bottom-left
+        w_br = (lh * lw).unsqueeze(-1)  # bottom-right
+
+        # Gather corner values with clamped indices, then apply corner masks
+        v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
+        v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
+        v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
+        v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
+
+        # Bilinear interpolation, zeroed wherever the sample location is out of range
+        sampled = (w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br) * valid.unsqueeze(-1).float()
+
+        # Apply attention weights and sum over points: (bs, num_queries, num_heads, channels)
+        output += (sampled * weight.unsqueeze(-1)).sum(dim=3)
+
+    # Flatten last two dimensions to match kernel output
+    return output.reshape(bs, num_queries, num_heads * channels)
+
+
+run_benchmark(  # module-level call: executing this script starts the benchmark immediately
+    kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
+    impl_name="torch_eager",  # presumably surfaces as the "impl" field of the emitted records; confirm in run_benchmark
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_deformable_detr,  # the pure-PyTorch reference defined above
+    dtype="float32",
+)
\ No newline at end of file
diff --git a/deformable_detr/impls/cells/nv.py b/deformable_detr/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5
--- /dev/null
+++ b/deformable_detr/impls/cells/nv.py
@@ -0,0 +1,2 @@
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)  # echo nvidia-smi's captured stdout; stderr and the exit code are ignored
\ No newline at end of file
diff --git a/deformable_detr/impls/hf_kernels_deformable_detr.html b/deformable_detr/impls/hf_kernels_deformable_detr.html
new file mode 100644
index 0000000000000000000000000000000000000000..8203846442acfc0a17b0a7372d2971964aac9caf
--- /dev/null
+++ b/deformable_detr/impls/hf_kernels_deformable_detr.html
@@ -0,0 +1,4350 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>hf_kernels_deformable_detr</title>
+
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&display=swap" rel="stylesheet">
+
+    <script>
+// Iframe-friendly navigation router
+        (function() {
+            const isIframe = window.self !== window.top;
+            if (!isIframe) return; // Only activate in iframe context
+
+            // On load: if hash points to a different page, navigate there
+            const hash = window.location.hash;
+            if (hash && hash.startsWith('#/')) {
+                const targetPath = hash.slice(2); // Remove '#/'
+                const currentPath = window.location.pathname.split('/').pop();
+
+                // Only navigate if we're not already on the target page
+                if (targetPath !== currentPath) {
+                    window.location.href = targetPath;
+                    return; // Stop execution, we're navigating away
+                }
+            }
+
+            // Intercept all link clicks for hash-based navigation
+            document.addEventListener('click', function(e) {
+                const link = e.target.closest('a');
+                if (!link) return;
+
+                const href = link.getAttribute('href');
+
+                // Skip external links, anchors, and javascript: links
+                if (!href || href.startsWith('#') || href.startsWith('http') || href.startsWith('javascript:')) {
+                    return;
+                }
+
+                e.preventDefault();
+
+                // Convert relative/absolute path to hash-based navigation
+                const url = new URL(href, window.location.href);
+                let fullPath = url.pathname;
+
+                // Remove leading slash if present for cleaner paths
+                if (fullPath.startsWith('/')) {
+                    fullPath = fullPath.slice(1);
+                }
+
+                // Update parent URL hash
+                window.location.hash = '#/' + fullPath;
+
+                // For HTML files, navigate within iframe
+                if (fullPath.endsWith('.html') || fullPath.endsWith('/')) {
+                    const pathParts = fullPath.split('/').filter(p => p);
+                    const targetFile = pathParts[pathParts.length - 1] || 'index.html';
+                    window.location.href = targetFile;
+                } else {
+                    // For non-HTML files (raw .py, etc), open directly
+                    window.open(href, '_blank');
+                }
+            });
+        })();
+
+        // Apply theme and widget visibility immediately to prevent flicker
+        (function() {
+            const configTheme = 'dark';
+            const hasConfigUi = false;
+            const configUi = hasConfigUi ? 'None' : null;
+            const hasWidgetsConfig = false;
+            const widgetsOn = hasWidgetsConfig ? false : true;
+            let theme;
+            if (configTheme === 'auto') {
+                theme = window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
+            } else {
+                theme = localStorage.getItem('uvnote-theme') || configTheme;
+            }
+            document.documentElement.setAttribute('data-theme', theme);
+
+            // Initialize UI theme (css theme)
+            let ui = hasConfigUi ? configUi : (localStorage.getItem('uvnote-ui') || 'default');
+            if (ui !== 'default' && ui !== 'none' && ui !== 'monocolor') { ui = 'default'; }
+            document.documentElement.setAttribute('data-ui', ui);
+
+            // Apply widgets visibility
+            document.documentElement.setAttribute('data-widgets', widgetsOn ? 'on' : 'off');
+        })();
+    </script>
+    <style>
+:root[data-theme="light"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: #f6f8fa;
+    --bg-tertiary: #f8f9fa;
+    --bg-code: #f8f9fa;
+    --bg-error: #fdf2f2;
+    --bg-artifact: #e6f3ff;
+    --bg-artifact-hover: #d0e7ff;
+
+    --text-primary: #333;
+    --text-secondary: #656d76;
+    --text-error: #c53030;
+    --text-link: #0969da;
+
+    --border-primary: #e1e5e9;
+    --border-error: #e53e3e;
+    --border-cell-failed: #d73a49;
+
+    --shadow: rgba(0, 0, 0, 0.1);
+}
+
+:root[data-theme="dark"] {
+    --bg-primary: #0a0a0a;
+    --bg-secondary: #121212;
+    --bg-tertiary: #181818;
+    --bg-code: #0d0d0d;
+    --bg-error: #1a0f0f;
+    --bg-artifact: #151515;
+    --bg-artifact-hover: #1a1a1a;
+
+    --text-primary: #e0e0e0;
+    --text-secondary: #888888;
+    --text-error: #ff6b6b;
+    --text-link: #64b5f6;
+
+    --border-primary: #2a2a2a;
+    --border-error: #ff6b6b;
+    --border-cell-failed: #ff6b6b;
+
+    --shadow: rgba(255, 255, 255, 0.05);
+}
+
+/* Monocolor UI theme: black/white background, all text/borders single blue */
+:root[data-ui="monocolor"] {
+    --mono-color: #0a66ff;
+}
+
+:root[data-ui="monocolor"][data-theme="light"] {
+    --bg-primary: #ffffff;
+}
+
+:root[data-ui="monocolor"][data-theme="dark"] {
+    --bg-primary: #000000;
+}
+
+:root[data-ui="monocolor"] {
+    --bg-secondary: var(--bg-primary);
+    --bg-tertiary: var(--bg-primary);
+    --bg-code: var(--bg-primary);
+    --bg-error: var(--bg-primary);
+    --bg-artifact: var(--bg-primary);
+    --bg-artifact-hover: var(--bg-primary);
+
+    --text-primary: var(--mono-color);
+    --text-secondary: var(--mono-color);
+    --text-error: var(--mono-color);
+    --text-link: var(--mono-color);
+
+    --border-primary: var(--mono-color);
+    --border-error: var(--mono-color);
+    --border-cell-failed: var(--mono-color);
+
+    --shadow: none;
+}
+
+:root[data-ui="monocolor"] a {
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button,
+:root[data-ui="monocolor"] .theme-toggle,
+:root[data-ui="monocolor"] .reset-toggle,
+:root[data-ui="monocolor"] .back-button {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button:hover,
+:root[data-ui="monocolor"] .theme-toggle:hover,
+:root[data-ui="monocolor"] .reset-toggle:hover,
+:root[data-ui="monocolor"] .back-button:hover {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-dropdown {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    box-shadow: none;
+}
+
+:root[data-ui="monocolor"] .menu-item {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .system-info {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell {
+    border-color: var(--mono-color);
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .cell-header {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact:hover {
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .artifact-preview img,
+:root[data-ui="monocolor"] .artifact-preview svg {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .status-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .minimap,
+:root[data-ui="monocolor"] .file-explorer,
+:root[data-ui="monocolor"] .tools-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell-code {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tools-title,
+:root[data-ui="monocolor"] .file-explorer-section-title,
+:root[data-ui="monocolor"] .minimap-title {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button.active {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .file-explorer-item,
+:root[data-ui="monocolor"] .minimap-item {
+    color: var(--mono-color);
+}
+
+/* Force Pygments code to mono blue on mono bg */
+:root[data-ui="monocolor"] .highlight {
+    background: var(--bg-primary) !important;
+    color: var(--mono-color) !important;
+}
+
+:root[data-ui="monocolor"] .highlight *,
+:root[data-ui="monocolor"] .highlight .hll {
+    color: var(--mono-color) !important;
+    background: transparent !important;
+    border-color: var(--mono-color) !important;
+}
+
+/* Default code font + metrics (overridable via frontmatter) */
+:root {
+    --code-font-size: 0.95rem;
+    --code-line-height: 1.5;
+    --code-pad-y: 0.75rem;
+}
+
+/* Minimal UI theme overrides base variables for a flatter, 90s look */
+:root[data-ui="none"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: transparent;
+    --bg-tertiary: transparent;
+    --bg-code: #f9f9f9;
+    --bg-error: #fff0f0;
+    --bg-artifact: #f0f7ff;
+    --bg-artifact-hover: #e5f1ff;
+
+    --text-primary: #000000;
+    --text-secondary: #222222;
+    --text-error: #a00000;
+    --text-link: #0000ee;
+
+    --border-primary: #cccccc;
+    --border-error: #cc0000;
+    --border-cell-failed: #cc0000;
+
+    --shadow: none;
+}
+
+html {
+    overscroll-behavior: none;
+}
+
+body {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    line-height: 1.4;
+    max-width: 1000px;
+    margin: 0 auto;
+    padding: 15px;
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    transition: background-color 0.2s ease, color 0.2s ease;
+    overscroll-behavior: none;
+}
+
+/* Minimal "none" UI theme overrides */
+:root[data-ui="none"] body {
+    font-family: 'Times New Roman', Times, serif;
+    line-height: 1.5;
+    max-width: 860px;
+    padding: 12px;
+    background: #ffffff;
+    color: #000000;
+    transition: none;
+}
+
+/* Two panel layout removed */
+
+.controls {
+    position: fixed;
+    top: 20px;
+    right: 20px;
+    display: flex;
+    flex-direction: column;
+    align-items: flex-end;
+    gap: 0.25rem;
+    z-index: 1000;
+}
+
+.controls-buttons {
+    display: flex;
+    gap: 0.5rem;
+}
+
+.menu-button {
+    position: relative;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+/* Keep default control styling when widgets are enabled, even in minimal UI */
+:root[data-ui="none"][data-widgets="on"] .menu-button,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle,
+:root[data-ui="none"][data-widgets="on"] .back-button {
+    background: #f6f6f6;
+    border: 1px solid #cccccc;
+    color: #222222;
+}
+
+.menu-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+/* Controls state indicator (top-right) */
+/* Status widget (bottom-right) */
+.status-widget {
+    position: fixed;
+    right: 20px;
+    bottom: 20px;
+    width: auto;
+    max-width: 260px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 6px 8px;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    z-index: 100;
+}
+
+.status-widget strong {
+    color: var(--text-primary);
+}
+
+:root[data-ui="none"][data-widgets="on"] .status-widget {
+    background: #f6f6f6;
+    border-color: #ccc;
+    color: #222;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .back-button:hover {
+    background: #ededed;
+    border-color: #bbbbbb;
+    color: #000000;
+}
+
+.menu-dropdown {
+    position: absolute;
+    top: 100%;
+    right: 0;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    box-shadow: 0 4px 12px var(--shadow);
+    min-width: 160px;
+    opacity: 0;
+    visibility: hidden;
+    transform: translateY(-8px);
+    transition: all 0.2s ease;
+    z-index: 1001;
+    margin-top: 4px;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-dropdown {
+    background: #ffffff;
+    border: 1px solid #cccccc;
+    box-shadow: none;
+}
+
+.menu-button.active .menu-dropdown {
+    opacity: 1;
+    visibility: visible;
+    transform: translateY(0);
+}
+
+.menu-item {
+    display: block;
+    padding: 8px 12px;
+    color: var(--text-secondary);
+    text-decoration: none;
+    font-size: 0.85rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: pointer;
+}
+
+:root[data-ui="none"] .menu-item {
+    color: #000;
+    border-bottom: 1px solid #eee;
+}
+
+.menu-item:last-child {
+    border-bottom: none;
+}
+
+.menu-item:hover {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+}
+
+.menu-checkbox {
+    display: inline-block;
+    width: 16px;
+    font-family: monospace;
+    color: var(--text-link);
+}
+
+.theme-toggle,
+.reset-toggle,
+.back-button {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 4px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+.back-button {
+    text-decoration: none;
+    display: inline-block;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover,
+.back-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+.system-info {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    padding: 8px 12px;
+    margin-bottom: 16px;
+    font-size: 0.85em;
+    color: var(--text-secondary);
+}
+
+.system-info-header {
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 2px;
+}
+
+.system-info-content {
+    font-family: monospace;
+}
+
+.theme-toggle,
+.reset-toggle {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    /* padding: 0.4rem 0.6rem; */
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    user-select: none;
+    transition: all 0.2s ease;
+    text-transform: lowercase;
+    letter-spacing: 0;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover {
+    background: var(--bg-tertiary);
+    border-color: var(--text-secondary);
+    color: var(--text-primary);
+}
+
+.minimap {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Hide widgets and controls when disabled via frontmatter */
+:root[data-widgets="off"] .controls,
+:root[data-widgets="off"] .minimap,
+:root[data-widgets="off"] .file-explorer,
+:root[data-widgets="off"] .tools-widget,
+:root[data-widgets="off"] .status-widget {
+    display: none !important;
+}
+
+.file-explorer {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Drawing overlay */
+.draw-overlay {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100vw;
+    height: 100vh;
+    z-index: 80;
+    /* under widgets (100) and controls (1000) */
+    display: block;
+    pointer-events: none;
+    /* enabled only when a tool is active */
+}
+
+/* Tools widget */
+.tools-widget {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    z-index: 100;
+    opacity: 0.95;
+}
+
+.tools-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    user-select: none;
+}
+
+.tools-row {
+    display: flex;
+    gap: 0.4rem;
+    flex-wrap: wrap;
+}
+
+.tool-button {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.25rem 0.4rem;
+    cursor: pointer;
+    color: var(--text-secondary);
+    font-family: inherit;
+    font-size: 0.75rem;
+    user-select: none;
+}
+
+.tool-button:hover {
+    color: var(--text-primary);
+}
+
+.tool-button.active {
+    color: var(--text-primary);
+    border-color: var(--text-secondary);
+    background: var(--bg-secondary);
+}
+
+.minimap:hover,
+.file-explorer:hover {
+    opacity: 1;
+}
+
+.minimap-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.minimap-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.15rem 0;
+    border-left: 2px solid transparent;
+    padding-left: 0.5rem;
+    transition: all 0.2s ease;
+    cursor: pointer;
+}
+
+.minimap-item:hover {
+    color: var(--text-primary);
+    border-left-color: var(--text-secondary);
+}
+
+.minimap-item.active {
+    color: var(--text-primary);
+    border-left-color: var(--text-link);
+}
+
+.minimap-heading {
+    font-weight: normal;
+}
+
+.minimap-heading.h1 {
+    padding-left: 0.5rem;
+}
+
+.minimap-heading.h2 {
+    padding-left: 1rem;
+}
+
+.minimap-heading.h3 {
+    padding-left: 1.5rem;
+}
+
+.minimap-heading.h4 {
+    padding-left: 2rem;
+}
+
+.minimap-heading.h5 {
+    padding-left: 2.5rem;
+}
+
+.minimap-heading.h6 {
+    padding-left: 3rem;
+}
+
+.minimap-cell {
+    color: var(--text-link);
+    opacity: 0.8;
+    font-style: italic;
+}
+
+.minimap-cell:hover {
+    opacity: 1;
+}
+
+.file-explorer-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.file-explorer-section {
+    margin-bottom: 0.75rem;
+}
+
+.file-explorer-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin-bottom: 0.25rem;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.file-explorer-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.1rem 0;
+    margin-left: 0.5rem;
+    transition: color 0.2s ease;
+    cursor: pointer;
+    font-family: monospace;
+}
+
+.file-explorer-item:hover {
+    color: var(--text-primary);
+}
+
+.file-explorer-item.script {
+    color: var(--text-link);
+}
+
+.file-explorer-item.artifact {
+    color: var(--text-secondary);
+    opacity: 0.8;
+}
+
+
+/* Hide widgets on smaller screens */
+@media (max-width: 768px) {
+
+    .minimap,
+    .file-explorer,
+    .tools-widget {
+        display: none;
+    }
+}
+
+.cell {
+    margin: 1rem 0;
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    overflow: hidden;
+    background: var(--bg-secondary);
+}
+
+:root[data-ui="none"] .cell {
+    margin: 1em 0;
+    border: none;
+    background: transparent;
+}
+
+.cell-header {
+    background: var(--bg-secondary);
+    padding: 0.5rem 1rem;
+    border-bottom: 1px solid var(--border-primary);
+    font-family: inherit;
+    font-size: 0.85rem;
+}
+
+:root[data-ui="none"] .cell-header {
+    background: transparent;
+    border: none;
+    padding: 0;
+    font-weight: bold;
+}
+
+:root[data-ui="none"] .cell-content {
+    padding: 0;
+}
+
+:root[data-ui="none"] .copy-button,
+:root[data-ui="none"] .collapse-indicators,
+:root[data-ui="none"] .cell-meta,
+:root[data-ui="none"] .cell-outputs-header {
+    display: none !important;
+}
+
+:root[data-ui="none"] pre,
+:root[data-ui="none"] code {
+    font-family: Menlo, Monaco, 'Courier New', monospace;
+}
+
+:root[data-ui="none"] .code-content pre {
+    background: #f9f9f9;
+    border: 1px solid #ddd;
+    padding: 8px;
+}
+
+:root[data-ui="none"] .output {
+    background: transparent;
+    border: none;
+    padding: 0.25em 0;
+}
+
+/* Interactive styling for the cell header. These declarations were sitting
+   outside any rule, followed by a dangling closing brace; at top level that
+   made the parser fold them and the next selector into one invalid rule, so
+   the .cell-header:hover rule below was discarded as well. */
+.cell-header {
+    color: var(--text-secondary);
+    cursor: pointer;
+    user-select: none;
+    transition: background-color 0.2s ease;
+}
+
+.cell-header:hover {
+    background: var(--bg-tertiary);
+}
+
+.collapse-indicators {
+    color: var(--text-secondary);
+    font-size: 0.8rem;
+    opacity: 0.7;
+}
+
+.collapse-indicators span:hover {
+    color: var(--text-primary);
+    opacity: 1;
+}
+
+.cell-code {
+    display: block;
+    background: var(--bg-code);
+}
+
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code pre {
+    margin: 0;
+    padding: 0.75rem;
+    background: var(--bg-code);
+    overflow-x: auto;
+    color: var(--text-primary);
+}
+
+.cell-output {
+    padding: 0.75rem;
+    /* background: var(--bg-primary); */
+    background: var(--bg-secondary);
+}
+
+.cell-output.collapsed {
+    display: none;
+}
+
+/* Captured stdout block: themed surface, wraps long lines, and shows
+   scrollbars when content still exceeds the width the layout gives it. */
+.cell-stdout {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+    padding: 0.75rem;
+    border-radius: 1px;
+    font-family: inherit;
+    font-size: 0.9rem;
+    white-space: pre-wrap;
+    overflow: auto;
+    max-width: 100%;
+}
+
+.cell-stdout .stdout-text {
+    margin: 0;
+    /* reset pre default margin */
+    white-space: pre;
+    /* keep line breaks, NO wrapping */
+    display: inline-block;
+    /* shrink-to-content */
+    min-width: max-content;
+    /* allow very long lines to define intrinsic width */
+    font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+    tab-size: 2;
+}
+
+.cell-stderr {
+    background: var(--bg-error);
+    border-left: 2px solid var(--border-error);
+    padding: 1rem;
+    margin: 0.5rem 0;
+    font-family: inherit;
+    font-size: 0.9rem;
+    color: var(--text-error);
+    white-space: pre-wrap;
+}
+
+.uv-install-logs {
+    margin: 0.5rem 0;
+}
+
+/* Clickable header of the install-log section. */
+.uv-logs-header {
+    cursor: pointer;
+    padding: 0.75rem;
+    /* --border-color is not set by any theme block visible in this stylesheet
+       (presumably undefined - confirm); without a fallback the whole
+       declaration is invalid and no border is drawn. */
+    border-left: 3px solid var(--border-color, var(--border-primary));
+    font-family: inherit;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    user-select: none;
+}
+
+/* Body of the install-log section: monospace, wrapped, horizontally
+   scrollable when needed. */
+.uv-logs-content {
+    background: var(--bg-secondary);
+    padding: 1rem;
+    /* Same fallback as the header so both share one accent border. */
+    border-left: 3px solid var(--border-color, var(--border-primary));
+    white-space: pre-wrap;
+    font-family: monospace;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    overflow-x: auto;
+}
+
+.cell-artifacts {
+    margin: 1rem 0;
+}
+
+.cell-artifacts h4 {
+    margin: 0 0 0.5rem 0;
+    color: var(--text-secondary);
+    font-size: 0.9rem;
+}
+
+.artifact {
+    display: inline-block;
+    background: var(--bg-artifact);
+    padding: 0.25rem 0.5rem;
+    border-radius: 1px;
+    margin: 0.25rem 0.5rem 0.25rem 0;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-link);
+    text-decoration: none;
+    transition: background-color 0.2s ease;
+    border: 1px solid var(--border-primary);
+}
+
+.artifact:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-preview {
+    margin-top: 1rem;
+}
+
+.artifact-preview img {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.artifact-preview svg {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+    display: block;
+}
+
+/* Style SVG text elements */
+.artifact-preview svg g {
+    fill: var(--text-primary) !important;
+}
+
+/* Auto-theme SVG elements */
+.artifact-preview svg {
+    background: transparent;
+}
+
+/* Invert SVG images in dark mode */
+:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
+    filter: invert(0.9) hue-rotate(180deg);
+}
+
+/* Keep SVG images readable in monocolor mode */
+:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
+    filter: none;
+}
+
+/* CSV table styling */
+.artifact-csv {
+    margin-top: 1rem;
+    overflow-x: auto;
+}
+
+.csv-table {
+    width: 100%;
+    border-collapse: collapse;
+    font-size: 0.9rem;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.csv-table th,
+.csv-table td {
+    padding: 0.5rem 0.75rem;
+    text-align: left;
+    border: 1px solid var(--border-primary);
+}
+
+.csv-table th {
+    background: var(--bg-tertiary);
+    font-weight: 600;
+    color: var(--text-primary);
+}
+
+.csv-table tbody tr:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-csv-error {
+    margin-top: 1rem;
+    padding: 1rem;
+    background: var(--bg-error);
+    color: var(--text-error);
+    border: 1px solid var(--border-error);
+    border-radius: 1px;
+}
+
+.cell-failed {
+    border-color: var(--border-cell-failed);
+}
+
+.cell-failed .cell-header {
+    background: var(--bg-error);
+    color: var(--text-error);
+}
+
+.cell-commented {
+    opacity: 0.6;
+    border-style: dashed;
+}
+
+.cell-commented .cell-header {
+    background: var(--bg-secondary);
+    color: var(--text-secondary);
+    font-style: italic;
+}
+
+/* Small action buttons (run / copy / raw / GitHub / HF): one shared base
+   look, followed by the per-button differences. */
+.run-btn,
+.copy-btn,
+.raw-btn,
+.github-btn,
+.hf-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+}
+
+/* These three additionally drop the underline and lay out inline-block. */
+.raw-btn,
+.github-btn,
+.hf-btn {
+    text-decoration: none;
+    display: inline-block;
+}
+
+/* Shared hover feedback. */
+.run-btn:hover,
+.copy-btn:hover,
+.raw-btn:hover,
+.github-btn:hover,
+.hf-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.raw-btn:hover,
+.github-btn:hover,
+.hf-btn:hover {
+    text-decoration: none;
+}
+
+/* Disabled state exists only for the run and copy buttons. */
+.run-btn:disabled,
+.copy-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+}
+
+/* Success flash after copying; kept after the hover rule so it wins on ties. */
+.copy-btn.copied {
+    color: #4caf50;
+    background: var(--bg-primary);
+    border-color: #4caf50;
+    transition: all 0.2s ease;
+}
+
+.output-stale {
+    opacity: 0.5;
+    position: relative;
+}
+
+.output-stale::after {
+    content: '⏳ updating...';
+    position: absolute;
+    top: 8px;
+    right: 8px;
+    background: var(--bg-secondary);
+    padding: 4px 8px;
+    border-radius: 2px;
+    font-size: 0.75em;
+    color: var(--text-secondary);
+    border: 1px solid var(--border-primary);
+}
+
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+    margin-top: 1.5rem;
+    margin-bottom: 0.75rem;
+    color: var(--text-primary);
+}
+
+h1 {
+    margin-top: 0;
+    margin-bottom: 1rem;
+}
+
+p {
+    margin: 0.75rem 0;
+    color: var(--text-primary);
+}
+
+a {
+    color: var(--text-link);
+}
+
+img {
+    max-width: 100%;
+    height: auto;
+    border-radius: 1px;
+    box-shadow: none;
+}
+
+pre,
+code {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+}
+
+.code-wrap {
+    position: relative;
+}
+
+.code-line-highlight {
+    display: none;
+    position: absolute;
+    left: 0;
+    right: 0;
+    height: 1.5em;
+    background: rgba(255, 235, 170, 0.35);
+    pointer-events: none;
+    border-left: 3px solid #f4c542;
+}
+
+.line-number {
+    cursor: pointer;
+    text-decoration: none;
+    color: var(--text-secondary);
+    padding: 0 0.25rem;
+}
+
+.line-number.selected {
+    background: rgba(255, 235, 170, 0.4);
+    color: var(--text-primary);
+}
+
+/* Line numbers */
+.highlight-with-lines {
+    display: flex;
+}
+
+.line-numbers {
+    background: var(--bg-tertiary);
+    padding: var(--code-pad-y) 0.5rem;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+    line-height: var(--code-line-height);
+    color: var(--text-secondary);
+    user-select: none;
+    text-align: right;
+    border-right: 1px solid var(--border-primary);
+}
+
+.line-numbers .line-number {
+    display: block;
+    line-height: var(--code-line-height);
+}
+
+.highlight-with-lines .highlight {
+    flex: 1;
+}
+
+.highlight .hll {
+    background-color: transparent;
+}
+
+/* don't conflict with our highlight */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem;
+    line-height: var(--code-line-height);
+}
+
+/* Collapsed code styling */
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code.expanded {
+    display: block;
+}
+
+/* NOTE(review): the brace groups below look like template tags
+   (if / else / endif on config.collapse_code, and a pygments_css
+   substitution) whose delimiters were split apart, presumably by a
+   formatter - confirm against the template source. As literal CSS they are
+   selector-less blocks, so BOTH .cell-code rules apply and the later
+   `display: block` one takes precedence; no Pygments rules are emitted here. */
+    {
+    % if config.collapse_code %
+}
+
+.cell-code {
+    display: none;
+}
+
+    {
+    % else %
+}
+
+.cell-code {
+    display: block;
+    border-bottom: 1px solid var(--border-primary);
+}
+
+    {
+    % endif %
+}
+
+    {
+        {
+        pygments_css
+    }
+}
+
+/* Ensure our code metrics override Pygments defaults */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem !important;
+    line-height: var(--code-line-height) !important;
+    font-size: var(--code-font-size) !important;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important;
+    border: none;
+}
+
+.line-numbers {
+    line-height: var(--code-line-height) !important;
+}
+
+.line-numbers .line-number {
+    line-height: var(--code-line-height) !important;
+}
+
+/* Custom CSS from frontmatter */
+    {
+        {
+        config.custom_css
+    }
+}
+
+/* NOTE(review): what follows appears to be an unrendered template
+   conditional on config.code_font_size (string value used as-is, otherwise
+   suffixed with px) - confirm against the template source. As literal CSS,
+   both :root rules assign a brace block rather than a length to
+   --code-font-size, replacing the 0.95rem default declared earlier; rules
+   using `font-size: var(--code-font-size)` would then have no valid size. */
+    {
+    # Override code font size from frontmatter (accept number as px) #
+}
+
+    {
+    % if config.code_font_size is not none %
+}
+
+    {
+    % if config.code_font_size is string %
+}
+
+:root {
+    --code-font-size: {
+            {
+            config.code_font_size
+        }
+    }
+
+    ;
+}
+
+    {
+    % else %
+}
+
+:root {
+    --code-font-size: {
+            {
+            config.code_font_size
+        }
+    }
+
+    px;
+}
+
+    {
+    % endif %
+}
+
+    {
+    % endif %
+}
+
+/* Cursor for tools */
+body[data-tool="arrow"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><path d="M2 2l7.586 7.586"/><circle cx="11" cy="11" r="2"/></svg>') 12 12, crosshair;
+}
+
+body[data-tool="pen"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><circle cx="4" cy="20" r="2" fill="%23e53935"/></svg>') 4 20, pointer;
+}
+
+body[data-tool="eraser"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M20 20H7l-7-7 7-7h13v14z"/><path d="M13 13l7-7"/><path d="M13 13L9 9"/></svg>') 12 12, auto;
+}
+
+/* Color picker styles */
+.tools-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin: 0.75rem 0 0.5rem 0;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.color-row {
+    display: grid;
+    grid-template-columns: repeat(6, 1fr);
+    gap: 0.25rem;
+    margin-bottom: 0.5rem;
+}
+
+.color-swatch {
+    width: 18px;
+    height: 18px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    position: relative;
+}
+
+.color-swatch:hover {
+    transform: scale(1.1);
+    border-color: var(--text-secondary);
+}
+
+.color-swatch.selected {
+    border-color: var(--text-primary);
+    box-shadow: 0 0 0 2px var(--text-link);
+}
+
+.color-swatch.selected::after {
+    content: '✓';
+    position: absolute;
+    top: 50%;
+    left: 50%;
+    transform: translate(-50%, -50%);
+    color: white;
+    font-size: 10px;
+    font-weight: bold;
+    text-shadow: 1px 1px 1px black;
+}
+
+.color-input {
+    width: 24px;
+    height: 24px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    background: none;
+    padding: 0;
+    grid-column: span 2;
+    justify-self: center;
+}
+
+.color-input:hover {
+    border-color: var(--text-secondary);
+}
+
+/* Thickness slider styles */
+.thickness-row {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    margin-top: 0.75rem;
+}
+
+.thickness-slider {
+    flex: 1;
+    -webkit-appearance: none;
+    appearance: none;
+    height: 4px;
+    background: var(--border-primary);
+    border-radius: 2px;
+    outline: none;
+    opacity: 0.7;
+    transition: opacity 0.2s;
+}
+
+.thickness-slider:hover {
+    opacity: 1;
+}
+
+.thickness-slider::-webkit-slider-thumb {
+    -webkit-appearance: none;
+    appearance: none;
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+}
+
+.thickness-slider::-moz-range-thumb {
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+    border: none;
+}
+
+.thickness-value {
+    font-size: 0.7rem;
+    color: var(--text-secondary);
+    min-width: 20px;
+    text-align: right;
+}
+
+.highlight {
+    background: none !important;
+}
+
+/* Loading animations */
+.loading-spinner {
+    display: inline-block;
+    width: 16px;
+    height: 16px;
+    border: 2px solid var(--border-primary);
+    border-radius: 50%;
+    border-top-color: var(--text-link);
+    animation: spin 1s linear infinite;
+    margin-right: 8px;
+    vertical-align: middle;
+}
+
+@keyframes spin {
+    to {
+        transform: rotate(360deg);
+    }
+}
+
+.loading-skeleton {
+    display: inline-block;
+    background: var(--bg-tertiary);
+    background: linear-gradient(90deg,
+            var(--bg-tertiary) 25%,
+            var(--bg-secondary) 50%,
+            var(--bg-tertiary) 75%);
+    background-size: 200% 100%;
+    animation: loading-shimmer 2s ease-in-out infinite;
+    border-radius: 2px;
+    height: 1em;
+    width: 80px;
+    vertical-align: middle;
+}
+
+@keyframes loading-shimmer {
+    0% {
+        background-position: -200% 0;
+    }
+
+    100% {
+        background-position: 200% 0;
+    }
+}
+
+/* Loading state for cell output */
+.cell-output:has(.loading-spinner) {
+    opacity: 0.7;
+    background: var(--bg-secondary);
+    /* border-left: 3px solid var(--text-link); */
+}
+    </style>
+    <script>
+// --- Drag utilities ---
+        function clamp(val, min, max) { return Math.max(min, Math.min(max, val)); }
+
+        function restorePosition(el, storageKey) {
+            try {
+                const raw = localStorage.getItem(storageKey);
+                if (!raw) return;
+                const pos = JSON.parse(raw);
+                if (typeof pos.left === 'number' && typeof pos.top === 'number') {
+                    el.style.left = pos.left + 'px';
+                    el.style.top = pos.top + 'px';
+                    el.style.right = 'auto';
+                    el.style.bottom = 'auto';
+                }
+            } catch (_) {}
+        }
+
+        function savePosition(el, storageKey) {
+            try {
+                const left = parseFloat(el.style.left || 'NaN');
+                const top = parseFloat(el.style.top || 'NaN');
+                if (!Number.isNaN(left) && !Number.isNaN(top)) {
+                    localStorage.setItem(storageKey, JSON.stringify({ left, top }));
+                }
+            } catch (_) {}
+        }
+
+
+        /**
+         * Make `el` draggable by mouse or touch while keeping it inside the window.
+         *
+         * @param {HTMLElement} el - Element moved through its inline left/top styles.
+         * @param {string} storageKey - Key handed to savePosition / restorePosition.
+         * @param {HTMLElement} [handleEl] - Optional drag handle; when omitted the
+         *     element itself starts the drag.
+         */
+        function makeDraggable(el, storageKey, handleEl) {
+            let dragging = false;
+            let startX = 0, startY = 0; // cursor position when the drag started
+            let origLeft = 0, origTop = 0; // element's viewport position when the drag started
+
+            // Move the element by the cursor delta, clamped to the window bounds.
+            const onMove = (e) => {
+                if (!dragging) return;
+                // Touch events carry coordinates on the first touch point.
+                const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+                const clientY = e.touches ? e.touches[0].clientY : e.clientY;
+                const dx = clientX - startX;
+                const dy = clientY - startY;
+                const w = el.offsetWidth;
+                const h = el.offsetHeight;
+                const maxX = window.innerWidth - w;
+                const maxY = window.innerHeight - h;
+                const newLeft = clamp(origLeft + dx, 0, maxX);
+                const newTop = clamp(origTop + dy, 0, maxY);
+                el.style.left = newLeft + 'px';
+                el.style.top = newTop + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+                // NOTE(review): touchmove is registered with passive: false but
+                // preventDefault() is never called here - confirm whether page
+                // scrolling during a touch drag is intended.
+            };
+
+            // Stop tracking, persist the final position and re-run the widget layout.
+            const endDrag = () => {
+                if (!dragging) return;
+                dragging = false;
+                document.removeEventListener('mousemove', onMove);
+                document.removeEventListener('mouseup', endDrag);
+                document.removeEventListener('touchmove', onMove);
+                document.removeEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grab');
+                savePosition(el, storageKey);
+                // ensure no-overlap constraint after a drag
+                // (any error thrown by the layout call is deliberately swallowed)
+                try { layoutWidgetsStackedBottomRight(); } catch (_) {}
+            };
+
+            const startDrag = (e) => {
+                // Start from element's current on-screen rect: convert whatever
+                // positioning was in effect into explicit left/top values.
+                const elRect = el.getBoundingClientRect();
+                el.style.left = elRect.left + 'px';
+                el.style.top = elRect.top + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+
+                dragging = true;
+                startX = e.touches ? e.touches[0].clientX : e.clientX;
+                startY = e.touches ? e.touches[0].clientY : e.clientY;
+                origLeft = elRect.left;
+                origTop = elRect.top;
+
+                // Listen on the document so the drag continues when the cursor
+                // leaves the element or handle.
+                document.addEventListener('mousemove', onMove);
+                document.addEventListener('mouseup', endDrag);
+                document.addEventListener('touchmove', onMove, { passive: false });
+                document.addEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grabbing');
+                e.preventDefault();
+            };
+
+            (handleEl || el).addEventListener('mousedown', startDrag);
+            (handleEl || el).addEventListener('touchstart', startDrag, { passive: false });
+
+            // Apply any saved position on init
+            restorePosition(el, storageKey);
+        }
+        function toggleCell(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+            }
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+            }
+            
+            updateIndicators(cellId);
+            encodeToolStateToUrl();
+        }
+        
+        function toggleCode(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+        
+        function toggleOutput(cellId) {
+            const outputElement = document.getElementById('output-' + cellId);
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+
+        function toggleUvLogs(headerElement) {
+            const contentElement = headerElement.nextElementSibling;
+            if (contentElement) {
+                const isCollapsed = contentElement.style.display === 'none';
+                contentElement.style.display = isCollapsed ? 'block' : 'none';
+                headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+
+                // Update the header indicator if it exists
+                const uvLogsDiv = headerElement.parentElement;
+                if (uvLogsDiv && uvLogsDiv.id && uvLogsDiv.id.startsWith('uv-logs-')) {
+                    const cellId = uvLogsDiv.id.replace('uv-logs-', '');
+                    const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+
+        function toggleUvLogsFromHeader(cellId) {
+            const uvLogsElement = document.getElementById('uv-logs-' + cellId);
+            const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+            if (uvLogsElement) {
+                const headerElement = uvLogsElement.querySelector('.uv-logs-header');
+                const contentElement = uvLogsElement.querySelector('.uv-logs-content');
+                if (contentElement && headerElement) {
+                    const isCollapsed = contentElement.style.display === 'none';
+                    contentElement.style.display = isCollapsed ? 'block' : 'none';
+                    headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+        
+        function updateIndicators(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            const indicators = document.querySelector(`[onclick*="${cellId}"]`)?.closest('.cell-header')?.querySelector('.collapse-indicators');
+            
+            if (indicators) {
+                const codeCollapsed = codeElement && codeElement.classList.contains('collapsed');
+                const outputCollapsed = outputElement && outputElement.classList.contains('collapsed');
+                
+                const codeIcon = codeCollapsed ? '▶' : '▼';
+                const outputIcon = outputCollapsed ? '▶' : '▼';
+                
+                const codeSpan = indicators.querySelector('[onclick*="toggleCode"]');
+                const outputSpan = indicators.querySelector('[onclick*="toggleOutput"]');
+                
+                if (codeSpan) codeSpan.innerHTML = `${codeIcon} code`;
+                if (outputSpan) outputSpan.innerHTML = `${outputIcon} output`;
+            }
+        }
+        
+        function toggleTheme() {
+            const html = document.documentElement;
+            const currentTheme = html.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            html.setAttribute('data-theme', newTheme);
+            localStorage.setItem('uvnote-theme', newTheme);
+            updateThemeIcon();
+            updateUiDebug();
+        }
+        
+        // Two panel code removed
+        
+        function updateThemeIcon() {
+            const theme = document.documentElement.getAttribute('data-theme');
+            const toggle = document.querySelector('.theme-toggle');
+            if (toggle) {
+                toggle.textContent = theme === 'dark' ? 'light' : 'dark';
+            }
+        }
+        function setUiTheme(newUi) {
+            if (newUi !== 'default' && newUi !== 'none' && newUi !== 'monocolor') return;
+            const html = document.documentElement;
+            html.setAttribute('data-ui', newUi);
+            try { localStorage.setItem('uvnote-ui', newUi); } catch (_) {}
+            updateUiMenu();
+            updateUiDebug();
+        }
+        function updateUiMenu() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const checks = {
+                default: document.getElementById('checkbox-ui-default'),
+                none: document.getElementById('checkbox-ui-none'),
+                monocolor: document.getElementById('checkbox-ui-monocolor')
+            };
+            if (checks.default) checks.default.textContent = ui === 'default' ? '☑' : '☐';
+            if (checks.none) checks.none.textContent = ui === 'none' ? '☑' : '☐';
+            if (checks.monocolor) checks.monocolor.textContent = ui === 'monocolor' ? '☑' : '☐';
+        }
+
+        function updateUiDebug() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const color = document.documentElement.getAttribute('data-theme') || 'light';
+            const el = document.getElementById('ui-debug');
+            if (el) {
+                el.textContent = `UI: ${ui} | Color: ${color}`;
+            }
+        }
+
+        // Line selection and deep-linking
+        function clearLineSelections() {
+            try {
+                document.querySelectorAll('.code-line-highlight').forEach(el => { el.style.display = 'none'; });
+                document.querySelectorAll('.line-number.selected').forEach(el => el.classList.remove('selected'));
+            } catch (_) {}
+        }
+
+        let _selection = null; // { cellId, a, b }
+
+        function clearSelection(updateUrl) {
+            clearLineSelections();
+            _selection = null;
+            if (updateUrl) {
+                try {
+                    const url = new URL(window.location.href);
+                    url.searchParams.delete('cell');
+                    url.searchParams.delete('line');
+                    history.replaceState(null, '', url.toString());
+                } catch (_) {}
+            }
+            updateStateIndicator();
+        }
+
+        function selectCellLine(cellId, line, updateUrl) {
+            try {
+                // Ensure only one selection across the whole document
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                // Measure line height directly from computed style
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                // Position overlay
+                overlay.style.display = 'block';
+                overlay.style.height = `${lh}px`;
+                overlay.style.top = `${pre.offsetTop + padTop + (line - 1) * lh}px`;
+
+                // Update selected class in line numbers
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    const sel = numbers.querySelector(`.line-number[data-line="${line}"]`);
+                    if (sel) sel.classList.add('selected');
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    url.searchParams.set('line', String(line));
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a: line, b: line };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLine error', e); }
+        }
+
+        function selectCellLines(cellId, startLine, endLine, updateUrl) {
+            try {
+                // normalize order
+                const a = Math.min(startLine, endLine);
+                const b = Math.max(startLine, endLine);
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                overlay.style.display = 'block';
+                overlay.style.top = `${pre.offsetTop + padTop + (a - 1) * lh}px`;
+                overlay.style.height = `${(b - a + 1) * lh}px`;
+
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    for (let i = a; i <= b; i++) {
+                        const el = numbers.querySelector(`.line-number[data-line="${i}"]`);
+                        if (el) el.classList.add('selected');
+                    }
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    if (a === b) url.searchParams.set('line', String(a));
+                    else url.searchParams.set('line', `${a}-${b}`);
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a, b };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLines error', e); }
+        }
+
+        // Drag-to-select support on line numbers
+        let _lineDrag = { active: false, cellId: null, start: 0 };
+        function onLineNumberMouseDown(e) {
+            const a = e.target.closest('.line-number');
+            if (!a) return;
+            e.preventDefault();
+            const cellId = a.dataset.cell;
+            const line = parseInt(a.dataset.line || '1', 10) || 1;
+            // Toggle off if this exact single line is already the only selection
+            const numbers = document.getElementById(`lines-${cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length === 1 && selected[0] === line) {
+                    clearSelection(true);
+                    return;
+                }
+            }
+            _lineDrag.active = true;
+            _lineDrag.cellId = cellId;
+            _lineDrag.start = line;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, _lineDrag.start, false);
+        }
+        function onDocMouseMove(e) {
+            if (!_lineDrag.active) return;
+            const el = document.elementFromPoint(e.clientX, e.clientY);
+            if (!el) return;
+            const a = el.closest && el.closest('.line-number');
+            if (!a) return;
+            if (a.dataset.cell !== _lineDrag.cellId) return;
+            const cur = parseInt(a.dataset.line || '1', 10) || 1;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, cur, false);
+        }
+        function onDocMouseUp(e) {
+            if (!_lineDrag.active) return;
+            const last = document.querySelector('.line-number.selected:last-of-type');
+            // finalize URL using the current selected range
+            const numbers = document.getElementById(`lines-${_lineDrag.cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length) {
+                    const a = Math.min(...selected); const b = Math.max(...selected);
+                    selectCellLines(_lineDrag.cellId, a, b, true);
+                }
+            }
+            _lineDrag.active = false; _lineDrag.cellId = null; _lineDrag.start = 0;
+        }
+
+        function applyLocationFromUrl() {
+            try {
+                const url = new URL(window.location.href);
+                const cell = url.searchParams.get('cell');
+                const lineParam = url.searchParams.get('line');
+                if (cell && lineParam) {
+                    if (lineParam.includes('-')) {
+                        const [a, b] = lineParam.split('-').map(x => parseInt(x, 10));
+                        if (!Number.isNaN(a) && !Number.isNaN(b)) selectCellLines(cell, a, b, false);
+                    } else {
+                        const l = parseInt(lineParam, 10);
+                        if (!Number.isNaN(l)) selectCellLine(cell, l, false);
+                    }
+                }
+                
+                // Apply tool parameters from URL
+                applyToolsFromUrl(url.searchParams);
+                
+                // Cell states will be applied later in DOMContentLoaded with proper timing
+                const encodedCellStates = url.searchParams.get('cells');
+                console.log('Encoded cell states from URL:', encodedCellStates);
+            } catch (_) {}
+        }
+        
+        /**
+         * Restore drawing-tool state from URL query parameters.
+         *
+         * Nothing happens unless `tools` is exactly '1'. When it is, the
+         * function applies the optional 'color', 'thickness' and 'shapes'
+         * parameters immediately and schedules two timers: one (300 ms) to
+         * render decoded shapes and one (200 ms) to show the tools widget,
+         * activate the 'tool' parameter's button and re-run the widget layout.
+         *
+         * @param {URLSearchParams} params - Query parameters to read.
+         */
+        function applyToolsFromUrl(params) {
+            try {
+                // Check if tools widget should be shown
+                const showTools = params.get('tools');
+                if (showTools === '1') {
+                    // Mark that tool was loaded from URL
+                    _urlLoadedTool = true;
+
+                    // Apply color (only a bare 6-digit hex value is accepted;
+                    // the '#' is added here)
+                    const color = params.get('color');
+                    if (color && /^[0-9a-fA-F]{6}$/.test(color)) {
+                        setStoredArrowColor('#' + color);
+                    }
+
+                    // Apply thickness (integers 1..10 only; NaN fails both comparisons)
+                    const thickness = params.get('thickness');
+                    if (thickness) {
+                        const value = parseInt(thickness, 10);
+                        if (value >= 1 && value <= 10) {
+                            setStoredLineThickness(value);
+                        }
+                    }
+
+                    // Don't override fadeout time for URL-loaded tools - let individual shapes decide
+
+                    // Load shapes from URL; an empty decode result leaves _shapes untouched
+                    const encodedShapes = params.get('shapes');
+                    if (encodedShapes) {
+                        const decodedShapes = decodeShapesFromUrl(encodedShapes);
+                        if (decodedShapes.length > 0) {
+                            _shapes = decodedShapes;
+                            saveShapes();
+                            // Trigger render after overlay is initialized
+                            setTimeout(() => {
+                                renderOverlay();
+                            }, 300);
+                        }
+                    }
+
+                    // Wait for widgets to be initialized before showing tools
+                    // NOTE(review): this callback runs after the surrounding try
+                    // has exited, so an exception thrown inside it (for example
+                    // from localStorage.setItem) is not caught by the catch below.
+                    setTimeout(() => {
+                        const toolsWidget = document.querySelector('.tools-widget');
+                        const checkbox = document.getElementById('checkbox-tools');
+                        if (toolsWidget && checkbox) {
+                            toolsWidget.style.display = 'block';
+                            checkbox.textContent = '☑';
+                            localStorage.setItem('uvnote-widget-tools', 'visible');
+                        }
+
+                        // Apply active tool by clicking the button whose text equals the tool name
+                        const activeTool = params.get('tool');
+                        if (activeTool && ['arrow', 'pen', 'eraser', 'spotlight'].includes(activeTool)) {
+                            const toolBtn = Array.from(document.querySelectorAll('.tool-button')).find(btn => btn.textContent === activeTool);
+                            if (toolBtn) {
+                                toolBtn.click();
+                            }
+                        }
+
+                        // Re-layout widgets after showing tools
+                        layoutWidgetsStackedBottomRight();
+                    }, 200);
+                }
+            } catch (_) {}
+        }
+        
+        function captureInitialCellStates() {
+            const cells = document.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    if (codeEl) {
+                        state.c = codeEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    if (outputEl) {
+                        state.o = outputEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    _initialCellStates[cellId] = state;
+                }
+            });
+            console.log('Captured initial cell states:', _initialCellStates);
+        }
+
+        function encodeCellStatesToUrl() {
+            // Get all cells and their collapse states
+            const cells = document.querySelectorAll('.cell');
+            const cellStates = {};
+            
+            console.log('Found cells:', cells.length);
+            
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                const initialState = _initialCellStates[cellId] || {};
+                
+                console.log(`Encoding cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    codeCollapsed: codeEl ? codeEl.classList.contains('collapsed') : 'N/A',
+                    outputCollapsed: outputEl ? outputEl.classList.contains('collapsed') : 'N/A',
+                    initialState: initialState
+                });
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    let hasChanges = false;
+                    
+                    if (codeEl) {
+                        const currentCodeState = codeEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialCodeState = initialState.c;
+                        // Only encode if different from initial state
+                        if (initialCodeState !== undefined && currentCodeState !== initialCodeState) {
+                            state.c = currentCodeState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    if (outputEl) {
+                        const currentOutputState = outputEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialOutputState = initialState.o;
+                        // Only encode if different from initial state
+                        if (initialOutputState !== undefined && currentOutputState !== initialOutputState) {
+                            state.o = currentOutputState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    // Only include cell if it has changes from initial state
+                    if (hasChanges) {
+                        cellStates[cellId] = state;
+                        console.log(`Added cell ${cellId}:`, state);
+                    }
+                }
+            });
+            
+            console.log('Final cell states to encode:', cellStates);
+            
+            // Return empty string if no changed cells
+            if (Object.keys(cellStates).length === 0) return '';
+            
+            // Encode as compact base64 string
+            const encoded = btoa(JSON.stringify(cellStates));
+            console.log('Encoded cell states:', encoded);
+            return encoded;
+        }
+        
+        function decodeCellStatesFromUrl(encodedStates) {
+            if (!encodedStates) return {};
+            
+            try {
+                return JSON.parse(atob(encodedStates));
+            } catch (e) {
+                console.error('Failed to decode cell states:', e);
+                return {};
+            }
+        }
+        
+        /**
+         * Apply decoded collapse states to the page.
+         *
+         * For each entry, `state.c` controls the code pane and `state.o` the
+         * output pane: 0 adds the 'collapsed' class, any other defined value
+         * removes it. Header indicators are refreshed afterwards. Progress is
+         * logged to the console throughout.
+         *
+         * @param {Object} cellStates - Map of cell id to { c?, o? }.
+         */
+        function applyCellStatesFromUrl(cellStates) {
+            console.log('Applying cell states from URL:', cellStates);
+            Object.entries(cellStates).forEach(([cellId, state]) => {
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+
+                console.log(`Cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    state: state
+                });
+
+                if (codeEl && state.c !== undefined) {
+                    if (state.c === 0) {
+                        codeEl.classList.add('collapsed');
+                        console.log(`Collapsed code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    } else {
+                        codeEl.classList.remove('collapsed');
+                        codeEl.classList.add('expanded'); // Explicitly add expanded class
+                        console.log(`Expanded code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            hasExpandedClass: codeEl.classList.contains('expanded'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    }
+                }
+
+                // NOTE(review): unlike the code pane above, the output pane is
+                // never given an 'expanded' class - confirm the asymmetry is intended.
+                if (outputEl && state.o !== undefined) {
+                    if (state.o === 0) {
+                        outputEl.classList.add('collapsed');
+                        console.log(`Collapsed output for cell ${cellId}`);
+                    } else {
+                        outputEl.classList.remove('collapsed');
+                        console.log(`Expanded output for cell ${cellId}`);
+                    }
+                }
+
+                // Update visual indicators and force style recalculation
+                try {
+                    updateIndicators(cellId);
+                    // Force browser to recalculate styles
+                    if (codeEl) {
+                        codeEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - code visible: ${getComputedStyle(codeEl).display !== 'none'}`);
+                    }
+                    if (outputEl) {
+                        outputEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - output visible: ${getComputedStyle(outputEl).display !== 'none'}`);
+                    }
+                } catch (e) {
+                    // A failure for one cell is logged and does not stop the remaining cells.
+                    console.error(`Error updating indicators for cell ${cellId}:`, e);
+                }
+            });
+        }
+
+        function encodeShapesToUrl() {
+            // Encode shapes as compact base64 string
+            if (_shapes.length === 0) return '';
+            
+            const shapeData = _shapes.map(shape => {
+                const baseData = {
+                    ct: shape.createdAt, // creation timestamp
+                    fo: shape.fadeoutTime || getFadeoutTime() // fadeout time for this shape
+                };
+                
+                if (shape.type === 'arrow') {
+                    return {
+                        ...baseData,
+                        t: 'a',
+                        x1: Math.round(shape.x1),
+                        y1: Math.round(shape.y1),
+                        x2: Math.round(shape.x2),
+                        y2: Math.round(shape.y2),
+                        c: shape.color.substring(1), // remove #
+                        w: shape.width
+                    };
+                } else if (shape.type === 'pen') {
+                    return {
+                        ...baseData,
+                        t: 'p',
+                        pts: shape.points.map(p => [Math.round(p.x), Math.round(p.y)]),
+                        c: shape.color.substring(1),
+                        w: shape.width
+                    };
+                } else if (shape.type === 'spotlight') {
+                    return {
+                        ...baseData,
+                        t: 's',
+                        x: Math.round(shape.x),
+                        y: Math.round(shape.y),
+                        r: Math.round(shape.radius)
+                    };
+                }
+            }).filter(Boolean);
+            
+            return btoa(JSON.stringify(shapeData));
+        }
+        
+        function decodeShapesFromUrl(encodedShapes) {
+            if (!encodedShapes) return [];
+            
+            try {
+                const shapeData = JSON.parse(atob(encodedShapes));
+                return shapeData.map(data => {
+                    const base = {
+                        createdAt: data.ct || Date.now(), // use encoded timestamp or current time
+                        fadeoutTime: data.fo || 0, // use encoded fadeout time or 0 (never fade)
+                        opacity: 1.0
+                    };
+                    
+                    if (data.t === 'a') {
+                        return {
+                            ...base,
+                            type: 'arrow',
+                            x1: data.x1,
+                            y1: data.y1,
+                            x2: data.x2,
+                            y2: data.y2,
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 'p') {
+                        return {
+                            ...base,
+                            type: 'pen',
+                            points: data.pts.map(([x, y]) => ({ x, y })),
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 's') {
+                        return {
+                            ...base,
+                            type: 'spotlight',
+                            x: data.x,
+                            y: data.y,
+                            radius: data.r,
+                            color: '#000000'
+                        };
+                    }
+                }).filter(Boolean);
+            } catch (e) {
+                console.error('Failed to decode shapes:', e);
+                return [];
+            }
+        }
+        
+        function encodeToolStateToUrl() {
+            // Don't update URL during initialization
+            if (_isInitializing) {
+                return window.location.href;
+            }
+            
+            const params = new URLSearchParams(window.location.search);
+            
+            // Check if tools widget is visible and has an active tool
+            const toolsWidget = document.querySelector('.tools-widget');
+            const activeTool = document.body.dataset.tool;
+            const hasActiveTool = activeTool && activeTool !== 'none';
+            const toolsWidgetVisible = toolsWidget && getComputedStyle(toolsWidget).display !== 'none';
+            
+            // Always handle shapes regardless of tool state
+            const encodedShapes = encodeShapesToUrl();
+            if (encodedShapes) {
+                params.set('shapes', encodedShapes);
+            } else {
+                params.delete('shapes');
+            }
+            
+            // Always preserve existing cell states from URL if present
+            const existingCellStates = params.get('cells');
+            if (existingCellStates) {
+                // Keep existing cell states - don't re-encode from DOM
+                params.set('cells', existingCellStates);
+            } else {
+                // Only encode new cell states if none exist in URL
+                const encodedCellStates = encodeCellStatesToUrl();
+                if (encodedCellStates) {
+                    params.set('cells', encodedCellStates);
+                }
+            }
+            
+            if (toolsWidgetVisible && hasActiveTool) {
+                // Include tool params when widget is visible AND tool is active
+                params.set('tools', '1');
+                params.set('tool', activeTool);
+                
+                // Get color (without # prefix)
+                const color = getArrowColor();
+                if (color && color.startsWith('#')) {
+                    params.set('color', color.substring(1));
+                }
+                
+                // Get thickness
+                const thickness = getLineThickness();
+                params.set('thickness', thickness.toString());
+            } else {
+                // Remove tool state params but keep shapes
+                params.delete('tools');
+                params.delete('tool');
+                params.delete('color');
+                params.delete('thickness');
+                params.delete('fadeout');
+            }
+            
+            // Update URL without reloading
+            const newUrl = window.location.pathname + (params.toString() ? '?' + params.toString() : '') + window.location.hash;
+            window.history.replaceState(null, '', newUrl);
+            
+            return window.location.href;
+        }
+
+        function resetLayout() {
+            try {
+                // Clear all uvnote-* keys
+                const allKeys = Object.keys(localStorage);
+                const uvnoteKeys = allKeys.filter(key => key.startsWith('uvnote-'));
+                uvnoteKeys.forEach(k => localStorage.removeItem(k));
+            } catch (_) {}
+            
+            // Clear any active selection and remove URL params
+            try { clearSelection(true); } catch(_) {}
+            // Reset active tool if any
+            try { window.setActiveTool('none'); } catch(_) {}
+            // Clear shapes
+            try { _shapes = []; saveShapes(); } catch(_) {}
+            // Reset URL-loaded tool flag
+            try { _urlLoadedTool = false; } catch(_) {}
+            // Reset all cells to expanded state
+            try {
+                const cells = document.querySelectorAll('.cell');
+                cells.forEach(cell => {
+                    const cellId = cell.id.replace('cell-', '');
+                    const codeEl = document.getElementById('code-' + cellId);
+                    const outputEl = document.getElementById('output-' + cellId);
+                    if (codeEl) codeEl.classList.remove('collapsed');
+                    if (outputEl) outputEl.classList.remove('collapsed');
+                    updateIndicators(cellId);
+                });
+            } catch(_) {}
+            
+            // Clear ALL URL parameters and reload with clean URL
+            try {
+                const cleanUrl = window.location.pathname + window.location.hash;
+                window.location.href = cleanUrl; // Use window.location.href instead of history.replaceState + reload
+            } catch (_) {
+                // Fallback - reload current page
+                location.reload();
+            }
+        }
+
+        function toggleMenu() {
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.toggle('active');
+            }
+        }
+
+        function toggleWidget(widgetName) {
+            let widget;
+            let checkbox;
+            
+            // Close the menu first
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.remove('active');
+            }
+            
+            switch(widgetName) {
+                case 'tools':
+                    widget = document.querySelector('.tools-widget');
+                    checkbox = document.getElementById('checkbox-tools');
+                    break;
+                case 'file-explorer':
+                    widget = document.querySelector('.file-explorer');
+                    checkbox = document.getElementById('checkbox-file-explorer');
+                    break;
+                case 'minimap':
+                    widget = document.querySelector('.minimap');
+                    checkbox = document.getElementById('checkbox-minimap');
+                    break;
+                case 'status':
+                    widget = document.querySelector('.status-widget');
+                    checkbox = document.getElementById('checkbox-status');
+                    break;
+                default:
+                    return;
+            }
+            
+            if (widget && checkbox) {
+                const isVisible = getComputedStyle(widget).display !== 'none';
+                widget.style.display = isVisible ? 'none' : 'block';
+                checkbox.textContent = isVisible ? '☐' : '☑';
+                
+                // Save state to localStorage
+                try {
+                    localStorage.setItem(`uvnote-widget-${widgetName}`, isVisible ? 'hidden' : 'visible');
+                } catch (_) {}
+                
+                // Re-layout widgets after visibility change
+                try { 
+                    layoutWidgetsStackedBottomRight(); 
+                } catch (_) {}
+                
+                // Update URL when tools widget visibility changes
+                if (widgetName === 'tools') {
+                    encodeToolStateToUrl();
+                }
+            }
+        }
+        
+        function initializeWidgetVisibility() {
+            const widgets = [
+                { name: 'tools', selector: '.tools-widget' },
+                { name: 'file-explorer', selector: '.file-explorer' },
+                { name: 'minimap', selector: '.minimap' },
+                { name: 'status', selector: '.status-widget' }
+            ];
+            
+            widgets.forEach(({ name, selector }) => {
+                const defaultState = name === 'status' ? 'visible' : 'hidden';
+                const savedState = localStorage.getItem(`uvnote-widget-${name}`) || defaultState;
+                const widget = document.querySelector(selector);
+                const checkbox = document.getElementById(`checkbox-${name}`);
+                
+                if (widget && checkbox) {
+                    const isVisible = savedState === 'visible';
+                    widget.style.display = isVisible ? 'block' : 'none';
+                    checkbox.textContent = isVisible ? '☑' : '☐';
+                }
+            });
+        }
+
+        // Close menu when clicking outside
+        document.addEventListener('click', function(event) {
+            const menuButton = document.querySelector('.menu-button');
+            // Don't close if clicking on a menu item (let the item handler close it)
+            if (menuButton && !menuButton.contains(event.target)) {
+                menuButton.classList.remove('active');
+            }
+        });
+
+        // Layout: stack widgets bottom-right and equalize widths
+        function hasCustomWidgetPositions() {
+            try {
+                return (
+                    localStorage.getItem('uvnote-minimap-pos') ||
+                    localStorage.getItem('uvnote-file-explorer-pos') ||
+                    localStorage.getItem('uvnote-tools-pos')
+                );
+            } catch (_) { return false; }
+        }
+
+        function rectsOverlap(r1, r2) {
+            return !(r1.right <= r2.left || r2.right <= r1.left || r1.bottom <= r2.top || r2.bottom <= r1.top);
+        }
+
+        function widgetsOverlap(widgets) {
+            for (let i = 0; i < widgets.length; i++) {
+                const a = widgets[i];
+                const ra = a.getBoundingClientRect();
+                for (let j = i + 1; j < widgets.length; j++) {
+                    const b = widgets[j];
+                    const rb = b.getBoundingClientRect();
+                    if (rectsOverlap(ra, rb)) return true;
+                }
+            }
+            return false;
+        }
+
+        function applyStackLayout(widgets, order) {
+            if (!widgets.length) return;
+            // Fixed equal width
+            const fixedWidth = 220;
+            widgets.forEach(el => { el.style.width = fixedWidth + 'px'; });
+
+            // Fit heights if needed to avoid overflow
+            const gap = 12;
+            const available = Math.max(0, window.innerHeight - 40 - gap * (order.length - 1));
+            const eachMax = Math.floor(available / order.length);
+            order.forEach(el => {
+                el.style.maxHeight = eachMax + 'px';
+                el.style.overflowY = 'auto';
+            });
+
+            // Stack bottom-up in the requested order
+            let bottomOffset = 20; // base gutter
+            order.forEach(el => {
+                el.style.left = 'auto';
+                el.style.top = 'auto';
+                el.style.right = '20px';
+                el.style.bottom = bottomOffset + 'px';
+                bottomOffset += el.offsetHeight + gap;
+            });
+        }
+
+        function layoutWidgetsStackedBottomRight() {
+            const minimap = document.querySelector('.minimap');
+            const fileExplorer = document.querySelector('.file-explorer');
+            const tools = document.querySelector('.tools-widget');
+            const status = document.querySelector('.status-widget');
+            const widgets = [minimap, fileExplorer, tools, status].filter(el => el && getComputedStyle(el).display !== 'none');
+            if (!widgets.length) return;
+
+            const order = [minimap, fileExplorer, tools, status].filter(Boolean).filter(el => getComputedStyle(el).display !== 'none');
+
+            // If user placed custom positions and there is no overlap, respect them.
+            if (hasCustomWidgetPositions() && !widgetsOverlap(widgets)) return;
+
+            applyStackLayout(widgets, order);
+        }
+        
+        // Panel icon removed
+        
+        let _minimapScrollContainer = null;
+        let _minimapScrollHandler = null;
+        function initMinimap() {
+            // Generate minimap content
+            const minimap = createMinimap();
+            document.body.appendChild(minimap);
+            // Make draggable (use title as handle)
+            const mTitle = minimap.querySelector('.minimap-title');
+            makeDraggable(minimap, 'uvnote-minimap-pos', mTitle);
+
+            // Attach scroll listener to window (two-panel removed)
+            _minimapScrollContainer = window;
+
+            if (_minimapScrollContainer) {
+                _minimapScrollHandler = () => updateMinimapActive();
+                if (_minimapScrollContainer === window) {
+                    window.addEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.addEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            updateMinimapActive();
+        }
+
+        function teardownMinimap() {
+            const minimap = document.querySelector('.minimap');
+            if (minimap && minimap.parentNode) minimap.parentNode.removeChild(minimap);
+            if (_minimapScrollContainer && _minimapScrollHandler) {
+                if (_minimapScrollContainer === window) {
+                    window.removeEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.removeEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            _minimapScrollContainer = null;
+            _minimapScrollHandler = null;
+        }
+        
+        function initFileExplorer() {
+            // Generate file explorer content
+            const fileExplorer = createFileExplorer();
+            document.body.appendChild(fileExplorer);
+        }
+        
+        function createMinimap() {
+            const minimap = document.createElement('div');
+            minimap.className = 'minimap';
+            
+            const title = document.createElement('div');
+            title.className = 'minimap-title';
+            title.textContent = 'navigation';
+            minimap.appendChild(title);
+            
+            // Find all headings and cells
+            const root = document.querySelector('.main-content') || document;
+            const headings = root.querySelectorAll('h1, h2, h3, h4, h5, h6');
+            const cells = root.querySelectorAll('.cell');
+            
+            // Combine and sort by position
+            const items = [];
+            
+            headings.forEach(heading => {
+                const id = heading.id || generateId(heading.textContent);
+                if (!heading.id) heading.id = id;
+                
+                items.push({
+                    element: heading,
+                    type: 'heading',
+                    level: parseInt(heading.tagName.charAt(1)),
+                    text: heading.textContent.trim(),
+                    id: id,
+                    position: heading.getBoundingClientRect().top + window.scrollY
+                });
+            });
+            
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const id = cell.id || `cell-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+                    if (!cell.id) cell.id = id;
+                    
+                    items.push({
+                        element: cell,
+                        type: 'cell',
+                        text: header.textContent.trim(),
+                        id: id,
+                        position: cell.getBoundingClientRect().top + window.scrollY
+                    });
+                }
+            });
+            
+            // Sort by position
+            items.sort((a, b) => a.position - b.position);
+            
+            // Create minimap items
+            items.forEach(item => {
+                const link = document.createElement('a');
+                link.className = `minimap-item ${item.type === 'heading' ? 'minimap-heading' : 'minimap-cell'}`;
+                if (item.type === 'heading') {
+                    link.classList.add(`h${item.level}`);
+                }
+                link.textContent = item.text.length > 25 ? item.text.substring(0, 22) + '...' : item.text;
+                link.href = `#${item.id}`;
+                link.onclick = function(e) {
+                    e.preventDefault();
+                    item.element.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                };
+                minimap.appendChild(link);
+            });
+            
+            return minimap;
+        }
+        
+        function generateId(text) {
+            return text.toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '')
+                .substring(0, 20);
+        }
+        
+        function updateMinimapActive() {
+            const minimapItems = document.querySelectorAll('.minimap-item');
+            const container = _minimapScrollContainer || window;
+            const containerRect = container === window ? null : container.getBoundingClientRect();
+            const scrollPos = (container === window ? window.scrollY : container.scrollTop) + 100; // Offset for better detection
+            
+            let activeItem = null;
+            minimapItems.forEach(item => {
+                const targetId = item.getAttribute('href').substring(1);
+                const target = document.getElementById(targetId);
+                
+                if (target) {
+                    const rectTop = target.getBoundingClientRect().top;
+                    const targetPos = (container === window)
+                        ? rectTop + window.scrollY
+                        : rectTop - containerRect.top + container.scrollTop;
+                    if (targetPos <= scrollPos) {
+                        activeItem = item;
+                    }
+                }
+                
+                item.classList.remove('active');
+            });
+            
+            if (activeItem) {
+                activeItem.classList.add('active');
+            }
+        }
+        
+        function createFileExplorer() {
+            const fileExplorer = document.createElement('div');
+            fileExplorer.className = 'file-explorer';
+            
+            const title = document.createElement('div');
+            title.className = 'file-explorer-title';
+            title.textContent = 'files';
+            fileExplorer.appendChild(title);
+            // Make draggable (use title as handle)
+            makeDraggable(fileExplorer, 'uvnote-file-explorer-pos', title);
+            
+            // Scripts section
+            const scriptsSection = document.createElement('div');
+            scriptsSection.className = 'file-explorer-section';
+            
+            const scriptsTitle = document.createElement('div');
+            scriptsTitle.className = 'file-explorer-section-title';
+            scriptsTitle.textContent = 'scripts';
+            scriptsSection.appendChild(scriptsTitle);
+            
+            // Find all cells and list their script files (single panel)
+            const root = document.querySelector('.main-content') || document;
+            const cells = root.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const cellText = header.textContent.trim();
+                    const cellMatch = cellText.match(/Cell: ([a-zA-Z_][a-zA-Z0-9_]*)/);
+                    if (cellMatch) {
+                        const cellId = cellMatch[1];
+                        const scriptItem = document.createElement('div');
+                        scriptItem.className = 'file-explorer-item script';
+                        scriptItem.textContent = `${cellId}.py`;
+                        scriptItem.onclick = function() {
+                            cell.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        };
+                        scriptsSection.appendChild(scriptItem);
+                    }
+                }
+            });
+            
+            fileExplorer.appendChild(scriptsSection);
+            
+            // Artifacts section
+            const artifactsSection = document.createElement('div');
+            artifactsSection.className = 'file-explorer-section';
+            
+            const artifactsTitle = document.createElement('div');
+            artifactsTitle.className = 'file-explorer-section-title';
+            artifactsTitle.textContent = 'artifacts';
+            artifactsSection.appendChild(artifactsTitle);
+            
+            // Find all artifact links (single panel)
+            const artifactsRoot = document.querySelector('.main-content') || document;
+            const artifacts = artifactsRoot.querySelectorAll('.artifact');
+            if (artifacts.length === 0) {
+                const noArtifacts = document.createElement('div');
+                noArtifacts.className = 'file-explorer-item artifact';
+                noArtifacts.textContent = '(none)';
+                noArtifacts.style.opacity = '0.5';
+                artifactsSection.appendChild(noArtifacts);
+            } else {
+                artifacts.forEach(artifact => {
+                    const artifactItem = document.createElement('div');
+                    artifactItem.className = 'file-explorer-item artifact';
+                    artifactItem.textContent = artifact.textContent;
+                    artifactItem.onclick = function() {
+                        artifact.click();
+                    };
+                    artifactsSection.appendChild(artifactItem);
+                });
+            }
+            
+            fileExplorer.appendChild(artifactsSection);
+            
+            return fileExplorer;
+        }
+
+        function initStatusWidget() {
+            let el = document.querySelector('.status-widget');
+            if (!el) {
+                el = document.createElement('div');
+                el.className = 'status-widget';
+                el.id = 'status-widget';
+                el.textContent = 'ready — Esc';
+                document.body.appendChild(el);
+            }
+        }
+
+        // Tools widget
+        let _cursorX = 0;
+        let _cursorY = 0;
+        let _cursorVisible = false;
+        
+        function setActiveTool(tool) {
+            if (!tool || tool === 'none') {
+                document.body.dataset.tool = 'none';
+                localStorage.setItem('uvnote-active-tool', 'none');
+                setOverlayActive(false);
+                _cursorVisible = false;
+                // Remove active class from all tool buttons when deactivating
+                const toolButtons = document.querySelectorAll('.tools-widget .tool-button');
+                toolButtons.forEach(btn => btn.classList.remove('active'));
+                updateStateIndicator();
+                encodeToolStateToUrl();
+                return;
+            }
+            document.body.dataset.tool = tool;
+            localStorage.setItem('uvnote-active-tool', tool);
+            setOverlayActive(true);
+            _cursorVisible = true;
+            updateStateIndicator();
+            encodeToolStateToUrl();
+        }
+
+        // Make setActiveTool globally accessible for ESC key handler
+        window.setActiveTool = setActiveTool;
+        
+
+        function getArrowColor() {
+            const saved = localStorage.getItem('uvnote-arrow-color');
+            if (saved) return saved;
+            return '#e53935'; // Default red color
+        }
+
+        function setStoredArrowColor(color) {
+            try { localStorage.setItem('uvnote-arrow-color', color); } catch (_) {}
+        }
+
+        function getLineThickness() {
+            const saved = localStorage.getItem('uvnote-line-thickness');
+            if (saved) return parseInt(saved, 10);
+            return 6; // default thickness
+        }
+
+        function setStoredLineThickness(thickness) {
+            try { localStorage.setItem('uvnote-line-thickness', thickness); } catch (_) {}
+        }
+
+        function getFadeoutTime() {
+            const saved = localStorage.getItem('uvnote-fadeout-time');
+            if (saved) return parseInt(saved, 10);
+            return 5; // default 5 seconds
+        }
+
+        function setStoredFadeoutTime(seconds) {
+            try { localStorage.setItem('uvnote-fadeout-time', seconds); } catch (_) {}
+        }
+
+        function createToolsWidget() {
+            const tools = document.createElement('div');
+            tools.className = 'tools-widget';
+
+            const title = document.createElement('div');
+            title.className = 'tools-title';
+            title.textContent = 'tools';
+            tools.appendChild(title);
+
+            const row = document.createElement('div');
+            row.className = 'tools-row';
+            tools.appendChild(row);
+
+            // Arrow tool
+            const arrowBtn = document.createElement('div');
+            arrowBtn.className = 'tool-button';
+            arrowBtn.textContent = 'arrow';
+            arrowBtn.onclick = function() {
+                const isActive = arrowBtn.classList.contains('active');
+                if (isActive) {
+                    arrowBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    arrowBtn.classList.add('active');
+                    setActiveTool('arrow');
+                }
+            };
+            row.appendChild(arrowBtn);
+
+            // Pen tool
+            const penBtn = document.createElement('div');
+            penBtn.className = 'tool-button';
+            penBtn.textContent = 'pen';
+            penBtn.onclick = function() {
+                const isActive = penBtn.classList.contains('active');
+                if (isActive) {
+                    penBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    penBtn.classList.add('active');
+                    setActiveTool('pen');
+                }
+            };
+            row.appendChild(penBtn);
+
+            // Eraser tool
+            const eraseBtn = document.createElement('div');
+            eraseBtn.className = 'tool-button';
+            eraseBtn.textContent = 'eraser';
+            eraseBtn.onclick = function() {
+                const isActive = eraseBtn.classList.contains('active');
+                if (isActive) {
+                    eraseBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    eraseBtn.classList.add('active');
+                    setActiveTool('eraser');
+                }
+            };
+            row.appendChild(eraseBtn);
+
+            // Spotlight tool
+            const spotlightBtn = document.createElement('div');
+            spotlightBtn.className = 'tool-button';
+            spotlightBtn.textContent = 'spotlight';
+            spotlightBtn.onclick = function() {
+                const isActive = spotlightBtn.classList.contains('active');
+                if (isActive) {
+                    spotlightBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    spotlightBtn.classList.add('active');
+                    setActiveTool('spotlight');
+                }
+            };
+            row.appendChild(spotlightBtn);
+
+            // Clear all
+            const clearBtn = document.createElement('div');
+            clearBtn.className = 'tool-button';
+            clearBtn.textContent = 'clear';
+            clearBtn.onclick = function() {
+                _shapes = [];
+                saveShapes();
+                renderOverlay();
+            };
+            row.appendChild(clearBtn);
+            
+            // We'll add the copy button at the end of the widget
+
+            // Restore active state from storage
+            const saved = localStorage.getItem('uvnote-active-tool') || 'none';
+            if (saved === 'arrow') {
+                arrowBtn.classList.add('active');
+                setActiveTool('arrow');
+            } else if (saved === 'pen') {
+                penBtn.classList.add('active');
+                setActiveTool('pen');
+            } else if (saved === 'eraser') {
+                eraseBtn.classList.add('active');
+                setActiveTool('eraser');
+            } else if (saved === 'spotlight') {
+                spotlightBtn.classList.add('active');
+                setActiveTool('spotlight');
+            }
+
+            // Color selector
+            const colorTitle = document.createElement('div');
+            colorTitle.className = 'tools-section-title';
+            colorTitle.textContent = 'color';
+            tools.appendChild(colorTitle);
+
+            const colorRow = document.createElement('div');
+            colorRow.className = 'tools-row color-row';
+            tools.appendChild(colorRow);
+
+            const swatchColors = [
+                // Primary colors
+                '#e53935', '#fb8c00', '#fdd835', '#43a047', '#1e88e5', '#8e24aa',
+                // Additional useful colors  
+                '#ff5722', '#795548', '#607d8b', '#9c27b0',
+                // Grayscale
+                '#000000', '#424242', '#9e9e9e', '#ffffff'
+            ];
+            const swatches = [];
+            swatchColors.forEach(c => {
+                const s = document.createElement('div');
+                s.className = 'color-swatch';
+                s.style.backgroundColor = c;
+                s.title = c;
+                s.onclick = () => {
+                    setStoredArrowColor(c);
+                    refreshColorUI(c);
+                    if (_cursorVisible) renderOverlay();
+                    encodeToolStateToUrl();
+                };
+                colorRow.appendChild(s);
+                swatches.push(s);
+            });
+
+            const colorInput = document.createElement('input');
+            colorInput.type = 'color';
+            colorInput.className = 'color-input';
+            colorInput.oninput = () => {
+                setStoredArrowColor(colorInput.value);
+                refreshColorUI(colorInput.value);
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+            colorRow.appendChild(colorInput);
+
+            function refreshColorUI(selected) {
+                const selectedHex = selected.startsWith('#') ? selected.toLowerCase() : rgbToHex(selected);
+                
+                swatches.forEach((s, i) => {
+                    const swatchHex = swatchColors[i].toLowerCase();
+                    if (swatchHex === selectedHex) {
+                        s.classList.add('selected');
+                    } else {
+                        s.classList.remove('selected');
+                    }
+                });
+                
+                try { 
+                    colorInput.value = selectedHex; 
+                } catch (_) {}
+            }
+
+            function rgbToHex(rgb) {
+                const m = rgb.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)\)/i);
+                if (!m) return '#000000';
+                const r = parseInt(m[1]).toString(16).padStart(2, '0');
+                const g = parseInt(m[2]).toString(16).padStart(2, '0');
+                const b = parseInt(m[3]).toString(16).padStart(2, '0');
+                return `#${r}${g}${b}`;
+            }
+
+            // Restore color selection
+            refreshColorUI(getArrowColor());
+
+            // Thickness slider
+            const thicknessTitle = document.createElement('div');
+            thicknessTitle.className = 'tools-section-title';
+            thicknessTitle.textContent = 'thickness';
+            tools.appendChild(thicknessTitle);
+
+            const thicknessRow = document.createElement('div');
+            thicknessRow.className = 'thickness-row';
+            tools.appendChild(thicknessRow);
+
+            const thicknessSlider = document.createElement('input');
+            thicknessSlider.type = 'range';
+            thicknessSlider.className = 'thickness-slider';
+            thicknessSlider.min = '1';
+            thicknessSlider.max = '10';
+            thicknessSlider.value = getLineThickness();
+            
+            const thicknessValue = document.createElement('span');
+            thicknessValue.className = 'thickness-value';
+            thicknessValue.textContent = thicknessSlider.value + 'px';
+
+            thicknessSlider.oninput = function() {
+                const value = parseInt(thicknessSlider.value, 10);
+                setStoredLineThickness(value);
+                thicknessValue.textContent = value + 'px';
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+
+            thicknessRow.appendChild(thicknessSlider);
+            thicknessRow.appendChild(thicknessValue);
+
+            // Fadeout time slider
+            const fadeoutTitle = document.createElement('div');
+            fadeoutTitle.className = 'tools-section-title';
+            fadeoutTitle.textContent = 'fadeout time';
+            tools.appendChild(fadeoutTitle);
+
+            const fadeoutRow = document.createElement('div');
+            fadeoutRow.className = 'thickness-row';
+            tools.appendChild(fadeoutRow);
+
+            const fadeoutSlider = document.createElement('input');
+            fadeoutSlider.type = 'range';
+            fadeoutSlider.className = 'thickness-slider';
+            fadeoutSlider.min = '0';
+            fadeoutSlider.max = '30';
+            fadeoutSlider.value = getFadeoutTime();
+            
+            const fadeoutValue = document.createElement('span');
+            fadeoutValue.className = 'thickness-value';
+            fadeoutValue.textContent = fadeoutSlider.value === '0' ? 'never' : fadeoutSlider.value + 's';
+
+            fadeoutSlider.oninput = function() {
+                const value = parseInt(fadeoutSlider.value, 10);
+                setStoredFadeoutTime(value);
+                fadeoutValue.textContent = value === 0 ? 'never' : value + 's';
+                encodeToolStateToUrl();
+            };
+
+            fadeoutRow.appendChild(fadeoutSlider);
+            fadeoutRow.appendChild(fadeoutValue);
+
+            // Draggable behavior
+            makeDraggable(tools, 'uvnote-tools-pos', title);
+
+            return tools;
+        }
+
+        function initTools() {
+            const widget = createToolsWidget();
+            document.body.appendChild(widget);
+        }
+
+        function teardownTools() {
+            const w = document.querySelector('.tools-widget');
+            if (w && w.parentNode) w.parentNode.removeChild(w);
+        }
+
+        // --- Canvas overlay for tools ---
+        // Module-level state shared by the overlay functions below. Everything
+        // prefixed with `_overlay` is created by initOverlay() and reset to null
+        // by teardownOverlay().
+        let _overlay = null; // the <canvas class="draw-overlay"> element
+        let _overlayCtx = null; // 2D rendering context of _overlay
+        let _overlayContainer = null; // scroll container; updateOverlayModeAndContainer() always sets it to window
+        let _overlayMode = 'single'; // only ever assigned 'single' in the code visible here
+        let _overlayResizeHandler = null; // window 'resize' listener (kept so it can be removed)
+        let _overlayScrollHandler = null; // window 'scroll' listener (kept so it can be removed)
+        // Shape currently being drawn, or null when idle. Arrows have no `type`
+        // field ({x1,y1,x2,y2,color,width}); pen strokes are
+        // {type:'pen',points,color,width}; spotlights are
+        // {type:'spotlight',x,y,radius,color}. Coordinates include the scroll offset.
+        let _drawing = null;
+        let _shapes = []; // committed shapes; persisted via loadShapes()/saveShapes()
+        let _fadeTimer = null; // setInterval handle that drives updateShapesFade()
+        let _urlLoadedTool = false; // track if tool was loaded from URL
+        let _isInitializing = true; // prevent URL updates during initialization
+        let _initialCellStates = {}; // track initial cell states from page load
+
+        function getOverlayStorageKey() { return 'uvnote-shapes'; }
+
+        function loadShapes() {
+            try {
+                const raw = localStorage.getItem(getOverlayStorageKey());
+                _shapes = raw ? JSON.parse(raw) : [];
+            } catch (_) { _shapes = []; }
+        }
+
+        function saveShapes() {
+            try { 
+                localStorage.setItem(getOverlayStorageKey(), JSON.stringify(_shapes));
+                // Always update URL when shapes change
+                encodeToolStateToUrl();
+            } catch (_) {}
+        }
+
+        function updateShapesFade() {
+            const now = Date.now();
+            let needsUpdate = false;
+
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const shape = _shapes[i];
+                if (!shape.createdAt) continue; // Skip old shapes without timestamps
+                
+                // Use individual shape's fadeout time, or global if not set
+                const shapesFadeoutSeconds = shape.fadeoutTime !== undefined ? shape.fadeoutTime : getFadeoutTime();
+                
+                // Skip fading if fadeout is disabled for this shape
+                if (shapesFadeoutSeconds === 0) continue;
+                
+                const fadeStartTime = Math.max(0, (shapesFadeoutSeconds - 2) * 1000); // Start fading 2s before end
+                const fadeEndTime = shapesFadeoutSeconds * 1000; // Fully gone after specified time
+                const age = now - shape.createdAt;
+                
+                if (age >= fadeEndTime) {
+                    // Remove completely faded shapes
+                    _shapes.splice(i, 1);
+                    needsUpdate = true;
+                } else if (age >= fadeStartTime) {
+                    // Update opacity for fading shapes
+                    const fadeProgress = (age - fadeStartTime) / (fadeEndTime - fadeStartTime);
+                    const newOpacity = 1 - fadeProgress;
+                    if (Math.abs(shape.opacity - newOpacity) > 0.01) {
+                        shape.opacity = newOpacity;
+                        needsUpdate = true;
+                    }
+                }
+            }
+
+            if (needsUpdate) {
+                saveShapes();
+                renderOverlay();
+                // Update URL to remove faded shapes  
+                encodeToolStateToUrl();
+            }
+        }
+
+        function getContentContainer() { return window; }
+
+        function updateOverlayModeAndContainer() {
+            _overlayContainer = window;
+            _overlayMode = 'single';
+        }
+
+        function updateOverlayBounds() {
+            if (!_overlay) return;
+            if (_overlayContainer === window) {
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = '0px';
+                _overlay.style.top = '0px';
+                _overlay.width = window.innerWidth;
+                _overlay.height = window.innerHeight;
+            } else {
+                const rect = _overlayContainer.getBoundingClientRect();
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = rect.left + 'px';
+                _overlay.style.top = rect.top + 'px';
+                _overlay.width = Math.max(0, Math.floor(rect.width));
+                _overlay.height = Math.max(0, Math.floor(rect.height));
+            }
+            renderOverlay();
+        }
+
+        function containerScrollLeft() {
+            return (_overlayContainer === window) ? (window.scrollX || 0) : (_overlayContainer.scrollLeft || 0);
+        }
+        function containerScrollTop() {
+            return (_overlayContainer === window) ? (window.scrollY || 0) : (_overlayContainer.scrollTop || 0);
+        }
+
+        function toCanvasCoords(clientX, clientY) {
+            const rect = _overlay.getBoundingClientRect();
+            return { x: clientX - rect.left, y: clientY - rect.top };
+        }
+
+        function onPointerDown(e) {
+            const tool = document.body.dataset.tool;
+            if (tool === 'arrow') {
+                startDrawArrow(e);
+            } else if (tool === 'pen') {
+                startDrawPen(e);
+            } else if (tool === 'eraser') {
+                eraseAt(e);
+            } else if (tool === 'spotlight') {
+                startDrawSpotlight(e);
+            }
+        }
+
+        function onPointerMove(e) {
+            // Update cursor position
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _cursorX = pt.x;
+            _cursorY = pt.y;
+            
+            if (!_drawing) {
+                // Just update cursor position and re-render
+                if (_cursorVisible) {
+                    renderOverlay();
+                }
+                return;
+            }
+            
+            if (_drawing.type === 'pen') {
+                moveDrawPen(e);
+            } else if (_drawing.type === 'spotlight') {
+                moveDrawSpotlight(e);
+            } else {
+                moveDrawArrow(e);
+            }
+        }
+        
+        function onPointerEnter(e) {
+            _cursorVisible = document.body.dataset.tool !== 'none';
+            if (_cursorVisible) {
+                renderOverlay();
+            }
+        }
+        
+        function onPointerLeave(e) {
+            _cursorVisible = false;
+            renderOverlay();
+        }
+
+        function onPointerUp(e) {
+            if (!_drawing) return;
+            if (_drawing.type === 'pen') {
+                endDrawPen();
+            } else if (_drawing.type === 'spotlight') {
+                endDrawSpotlight();
+            } else {
+                endDrawArrow();
+            }
+        }
+
+        function startDrawArrow(e) {
+            if (document.body.dataset.tool !== 'arrow') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                x1: pt.x + containerScrollLeft(),
+                y1: pt.y + containerScrollTop(),
+                x2: pt.x + containerScrollLeft(),
+                y2: pt.y + containerScrollTop(),
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawArrow(e) {
+            if (!_drawing) return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.x2 = pt.x + containerScrollLeft();
+            _drawing.y2 = pt.y + containerScrollTop();
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawArrow() {
+            if (!_drawing) return;
+            _shapes.push({ 
+                type: 'arrow', 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawPen(e) {
+            if (document.body.dataset.tool !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'pen',
+                points: [{
+                    x: pt.x + containerScrollLeft(),
+                    y: pt.y + containerScrollTop()
+                }],
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawPen(e) {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.points.push({
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop()
+            });
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawPen() {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            if (_drawing.points.length > 1) {
+                _shapes.push({ 
+                    ..._drawing,
+                    createdAt: Date.now(),
+                    fadeoutTime: getFadeoutTime(),
+                    opacity: 1.0
+                });
+            }
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawSpotlight(e) {
+            if (document.body.dataset.tool !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'spotlight',
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop(),
+                radius: getLineThickness() * 20, // Use thickness to control spotlight size (bigger default)
+                color: getArrowColor()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawSpotlight(e) {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const dx = pt.x + containerScrollLeft() - _drawing.x;
+            const dy = pt.y + containerScrollTop() - _drawing.y;
+            _drawing.radius = Math.max(20, Math.sqrt(dx * dx + dy * dy)); // Minimum radius of 20
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawSpotlight() {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            _shapes.push({ 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function distPointToSegment(px, py, x1, y1, x2, y2) {
+            const dx = x2 - x1, dy = y2 - y1;
+            if (dx === 0 && dy === 0) return Math.hypot(px - x1, py - y1);
+            const t = Math.max(0, Math.min(1, ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)));
+            const cx = x1 + t * dx, cy = y1 + t * dy;
+            return Math.hypot(px - cx, py - cy);
+        }
+
+        function eraseAt(e) {
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const x = pt.x + containerScrollLeft();
+            const y = pt.y + containerScrollTop();
+            const threshold = 10; // pixels
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const s = _shapes[i];
+                if (s.type === 'arrow') {
+                    const d = distPointToSegment(x, y, s.x1, s.y1, s.x2, s.y2);
+                    if (d <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                } else if (s.type === 'pen' && s.points) {
+                    // Check if click is near any line segment in the pen stroke
+                    let minDist = Infinity;
+                    for (let j = 1; j < s.points.length; j++) {
+                        const d = distPointToSegment(x, y, s.points[j-1].x, s.points[j-1].y, s.points[j].x, s.points[j].y);
+                        minDist = Math.min(minDist, d);
+                    }
+                    if (minDist <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                }
+            }
+            e.preventDefault();
+        }
+
+        function drawArrow(ctx, x1, y1, x2, y2, color, width, opacity = 1.0) {
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.fillStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            
+            // Check if points are too close (initial state)
+            const dx = x2 - x1;
+            const dy = y2 - y1;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            
+            if (distance < 5) {
+                // Draw just a small arrowhead pointing down-right when first clicked
+                const defaultAngle = Math.PI / 4; // 45 degrees (down-right)
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate arrowhead points
+                const hx1 = x1 + headLength * Math.cos(defaultAngle - headAngle);
+                const hy1 = y1 + headLength * Math.sin(defaultAngle - headAngle);
+                const hx2 = x1 + headLength * Math.cos(defaultAngle + headAngle);
+                const hy2 = y1 + headLength * Math.sin(defaultAngle + headAngle);
+                
+                // Draw arrowhead only
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            } else {
+                // Normal arrow drawing - head at x1,y1, tail at x2,y2
+                const angle = Math.atan2(y1 - y2, x1 - x2);
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate where the line should end (before the arrowhead)
+                const lineEndX = x1 - headLength * 0.8 * Math.cos(angle);
+                const lineEndY = y1 - headLength * 0.8 * Math.sin(angle);
+                
+                // Draw the line from tail to near the head
+                ctx.beginPath();
+                ctx.moveTo(x2, y2);
+                ctx.lineTo(lineEndX, lineEndY);
+                ctx.stroke();
+                
+                // Calculate arrowhead points
+                const hx1 = x1 - headLength * Math.cos(angle - headAngle);
+                const hy1 = y1 - headLength * Math.sin(angle - headAngle);
+                const hx2 = x1 - headLength * Math.cos(angle + headAngle);
+                const hy2 = y1 - headLength * Math.sin(angle + headAngle);
+                
+                // Draw arrowhead
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            }
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawPen(ctx, points, color, width, offX, offY, opacity = 1.0) {
+            if (!points || points.length < 2) return;
+            
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            ctx.beginPath();
+            ctx.moveTo(points[0].x - offX, points[0].y - offY);
+            for (let i = 1; i < points.length; i++) {
+                ctx.lineTo(points[i].x - offX, points[i].y - offY);
+            }
+            ctx.stroke();
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawAllSpotlights(ctx, spotlights, offX, offY) {
+            if (!spotlights || spotlights.length === 0) return;
+            
+            ctx.save();
+            
+            // Calculate the overall opacity based on all spotlights
+            const maxOpacity = Math.max(...spotlights.map(s => s.opacity || 1.0));
+            
+            // Fill entire canvas with dark overlay
+            ctx.fillStyle = `rgba(0, 0, 0, ${0.7 * maxOpacity})`;
+            ctx.fillRect(0, 0, ctx.canvas.width, ctx.canvas.height);
+            
+            // Cut out completely transparent holes for all spotlights
+            ctx.globalCompositeOperation = 'destination-out';
+            ctx.fillStyle = 'rgba(0, 0, 0, 1)'; // Solid black to ensure complete removal
+            for (const spotlight of spotlights) {
+                ctx.beginPath();
+                ctx.arc(spotlight.x - offX, spotlight.y - offY, spotlight.radius, 0, 2 * Math.PI);
+                ctx.fill();
+            }
+            
+            ctx.restore();
+        }
+
+        function renderOverlay() {
+            if (!_overlay || !_overlayCtx) return;
+            _overlayCtx.clearRect(0, 0, _overlay.width, _overlay.height);
+            const offX = containerScrollLeft();
+            const offY = containerScrollTop();
+            // Draw non-spotlight shapes first
+            for (const s of _shapes) {
+                const opacity = s.opacity !== undefined ? s.opacity : 1.0;
+                if (s.type === 'arrow') {
+                    drawArrow(_overlayCtx, s.x1 - offX, s.y1 - offY, s.x2 - offX, s.y2 - offY, s.color || '#f00', s.width || 2, opacity);
+                } else if (s.type === 'pen') {
+                    drawPen(_overlayCtx, s.points, s.color || '#f00', s.width || 2, offX, offY, opacity);
+                }
+            }
+            // Draw current drawing (non-spotlight)
+            if (_drawing) {
+                if (_drawing.type === 'pen') {
+                    drawPen(_overlayCtx, _drawing.points, _drawing.color, _drawing.width, offX, offY);
+                } else if (_drawing.type !== 'spotlight') {
+                    drawArrow(_overlayCtx, _drawing.x1 - offX, _drawing.y1 - offY, _drawing.x2 - offX, _drawing.y2 - offY, _drawing.color, _drawing.width);
+                }
+            }
+            
+            // Collect all spotlights (existing + current drawing + cursor preview)
+            const spotlights = [];
+            
+            // Add existing spotlight shapes
+            for (const s of _shapes) {
+                if (s.type === 'spotlight') {
+                    spotlights.push({
+                        x: s.x,
+                        y: s.y, 
+                        radius: s.radius,
+                        opacity: s.opacity !== undefined ? s.opacity : 1.0
+                    });
+                }
+            }
+            
+            // Add current spotlight being drawn
+            if (_drawing && _drawing.type === 'spotlight') {
+                spotlights.push({
+                    x: _drawing.x,
+                    y: _drawing.y,
+                    radius: _drawing.radius,
+                    opacity: 1.0
+                });
+            }
+            
+            // Add cursor preview spotlight if tool is active
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                if (tool === 'spotlight') {
+                    const thickness = getLineThickness();
+                    const radius = thickness * 20;
+                    const cursorWorldX = _cursorX + containerScrollLeft();
+                    const cursorWorldY = _cursorY + containerScrollTop();
+                    spotlights.push({
+                        x: cursorWorldX,
+                        y: cursorWorldY,
+                        radius: radius,
+                        opacity: 0.8
+                    });
+                }
+            }
+            
+            // Draw all spotlights as a single overlay with multiple holes
+            drawAllSpotlights(_overlayCtx, spotlights, offX, offY);
+            
+            // Draw cursor indicators for non-spotlight tools
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                const color = getArrowColor();
+                const thickness = getLineThickness();
+                
+                if (tool !== 'spotlight') {
+                    _overlayCtx.save();
+                    _overlayCtx.fillStyle = color;
+                    _overlayCtx.globalAlpha = 0.7;
+                    
+                    if (tool === 'eraser') {
+                        // Draw eraser indicator
+                        _overlayCtx.strokeStyle = color;
+                        _overlayCtx.lineWidth = 2;
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, 10, 0, 2 * Math.PI);
+                        _overlayCtx.stroke();
+                    } else {
+                        // Draw dot for pen/arrow
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, thickness / 2, 0, 2 * Math.PI);
+                        _overlayCtx.fill();
+                    }
+                    
+                    _overlayCtx.restore();
+                }
+            }
+        }
+
+        function setOverlayActive(active) {
+            if (!_overlay) initOverlay();
+            _overlay.style.pointerEvents = active ? 'auto' : 'none';
+            _overlay.style.cursor = active ? 'none' : 'auto';
+            // Re-render to ensure visibility aligns with content
+            renderOverlay();
+        }
+
+        function initOverlay() {
+            if (_overlay) return;
+            updateOverlayModeAndContainer();
+            _overlay = document.createElement('canvas');
+            _overlay.className = 'draw-overlay';
+            _overlayCtx = _overlay.getContext('2d');
+            document.body.appendChild(_overlay);
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+
+            // Events
+            _overlay.addEventListener('mousedown', onPointerDown);
+            _overlay.addEventListener('mousemove', onPointerMove);
+            _overlay.addEventListener('mouseenter', onPointerEnter);
+            _overlay.addEventListener('mouseleave', onPointerLeave);
+            document.addEventListener('mouseup', onPointerUp);
+            _overlay.addEventListener('touchstart', onPointerDown, { passive: false });
+            _overlay.addEventListener('touchmove', onPointerMove, { passive: false });
+            document.addEventListener('touchend', onPointerUp);
+
+            _overlayResizeHandler = () => updateOverlayBounds();
+            window.addEventListener('resize', _overlayResizeHandler);
+
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+            
+            // Start fade animation timer
+            _fadeTimer = setInterval(updateShapesFade, 100); // Update every 100ms for smooth fade
+        }
+
+        function rebindOverlayContainer() {
+            if (!_overlay) return;
+            // Remove old scroll handler
+            if (_overlayScrollHandler) { window.removeEventListener('scroll', _overlayScrollHandler); }
+            updateOverlayModeAndContainer();
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+        }
+
+        function teardownOverlay() {
+            if (!_overlay) return;
+            _overlay.removeEventListener('mousedown', onPointerDown);
+            _overlay.removeEventListener('mousemove', onPointerMove);
+            _overlay.removeEventListener('mouseenter', onPointerEnter);
+            _overlay.removeEventListener('mouseleave', onPointerLeave);
+            document.removeEventListener('mouseup', onPointerUp);
+            _overlay.removeEventListener('touchstart', onPointerDown);
+            _overlay.removeEventListener('touchmove', onPointerMove);
+            document.removeEventListener('touchend', onPointerUp);
+            if (_overlayResizeHandler) window.removeEventListener('resize', _overlayResizeHandler);
+            if (_overlayScrollHandler) {
+                if (_overlayContainer === window) {
+                    window.removeEventListener('scroll', _overlayScrollHandler);
+                } else if (_overlayContainer) {
+                    _overlayContainer.removeEventListener('scroll', _overlayScrollHandler);
+                }
+            }
+            if (_fadeTimer) {
+                clearInterval(_fadeTimer);
+                _fadeTimer = null;
+            }
+            if (_overlay.parentNode) _overlay.parentNode.removeChild(_overlay);
+            _overlay = null; _overlayCtx = null; _overlayContainer = null; _overlayResizeHandler = null; _overlayScrollHandler = null; _drawing = null;
+        }
+        
+        function teardownFileExplorer() {
+            const fe = document.querySelector('.file-explorer');
+            if (fe && fe.parentNode) fe.parentNode.removeChild(fe);
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+
+        function runCell(cellId){
+            const btn=document.querySelector('.run-btn[onclick*="'+cellId+'"]');
+            const output=document.getElementById('output-'+cellId);
+            if(btn){btn.textContent='⏳ running...';btn.disabled=true;}
+            if(output){output.classList.add('output-stale');}
+            fetch('/run/'+cellId,{method:'POST'}).then(r=>r.json()).then(data=>{
+                if(output){
+                    output.classList.remove('output-stale');
+                    let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
+
+                    console.log('UV Logs:', data);
+                    if(data.stderr) {
+                        // Split UV logs from regular stderr
+                        const lines = data.stderr.split('\\n');
+                        let uvLogs = [];
+                        let regularLogs = [];
+                        let inUvSection = true;
+
+                        for (const line of lines) {
+                            if (inUvSection) {
+                                uvLogs.push(line);
+                                if (line.startsWith('Installed ')) {
+                                    inUvSection = false;
+                                }
+                            } else {
+                                regularLogs.push(line);
+                            }
+                        }
+                        
+
+                        // If we never found "Installed", treat it all as regular stderr
+                        if (inUvSection) {
+                            html+='<div class="cell-stderr">'+escapeHtml(data.stderr)+'</div>';
+                        } else {
+                            const uvLogsStr = uvLogs.join('\\n');
+                            const regularLogsStr = regularLogs.join('\\n').trim();
+
+                            if (uvLogsStr) {
+                                html+='<div class="uv-install-logs">';
+                                html+='<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>';
+                                html+='<div class="uv-logs-content" style="display: none;">'+escapeHtml(uvLogsStr)+'</div>';
+                                html+='</div>';
+                            }
+                            if (regularLogsStr) {
+                                html+='<div class="cell-stderr">'+escapeHtml(regularLogsStr)+'</div>';
+                            }
+                        }
+                    }
+                    output.innerHTML=html;
+                }
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            }).catch(e=>{
+                console.error('Run failed:',e);
+                if(output){output.classList.remove('output-stale');}
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            });
+        }
+
+        function copyCell(cellId){
+            // Try multiple selectors to find the code element
+            // Pygments generates .highlight > pre with spans, not wrapped in <code>
+            let codeElement = document.querySelector('#code-'+cellId+' .highlight pre');
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' pre');
+            }
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' code');
+            }
+            if (!codeElement) {
+                // Fallback to the code div itself
+                codeElement = document.getElementById('code-'+cellId);
+            }
+
+            const btn = document.querySelector('.copy-btn[onclick*="'+cellId+'"]');
+
+            if (!codeElement) {
+                console.error('Code element not found for cell:', cellId);
+                return;
+            }
+            if (!btn) {
+                console.error('Copy button not found for cell:', cellId);
+                return;
+            }
+
+            const codeText = codeElement.textContent;
+            
+            if (navigator.clipboard && navigator.clipboard.writeText) {
+                navigator.clipboard.writeText(codeText).then(function() {
+                    console.log('Clipboard copy successful');
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                }).catch(function(err) {
+                    console.warn('Clipboard copy failed:', err);
+                    fallbackCopy();
+                });
+            } else {
+                console.log('Using fallback copy method');
+                fallbackCopy();
+            }
+            
+            function fallbackCopy() {
+                const textarea = document.createElement('textarea');
+                textarea.value = codeText;
+                textarea.style.position = 'absolute';
+                textarea.style.left = '-9999px';
+                document.body.appendChild(textarea);
+                textarea.select();
+                try {
+                    const success = document.execCommand('copy');
+                    console.log('Fallback copy success:', success);
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                } catch (err) {
+                    console.error('Fallback copy failed:', err);
+                    btn.textContent = 'Copy failed';
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                    }, 2000);
+                }
+                document.body.removeChild(textarea);
+            }
+        }
+
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
+
+
+        // Page bootstrap: runs once when the DOMContentLoaded event fires. It syncs
+        // the theme/menu UI, optionally initializes the widgets, restores state from
+        // the URL, and installs the document-level mouse and keyboard handlers.
+        document.addEventListener('DOMContentLoaded', function() {
+            // Capture initial cell states before any modifications
+            captureInitialCellStates();
+            
+            updateThemeIcon();
+            updateUiMenu();
+            updateUiDebug();
+            // Widgets are on unless the root element's data-widgets attribute is set
+            // to something other than 'on' (a missing/empty attribute counts as 'on').
+            const widgetsEnabled = (document.documentElement.getAttribute('data-widgets') || 'on') === 'on';
+            if (widgetsEnabled) {
+            initMinimap();
+            initFileExplorer();
+            initTools();
+            initOverlay();
+            initStatusWidget();
+            initializeWidgetVisibility();
+            layoutWidgetsStackedBottomRight();
+            // Re-run the widget layout whenever the window is resized.
+            window.addEventListener('resize', layoutWidgetsStackedBottomRight);
+            }
+
+            // Apply deep-link selection if present
+            applyLocationFromUrl();
+            updateStateIndicator();
+            
+            // Apply cell states from URL immediately
+            // (they are read from the 'cells' query parameter of the current URL)
+            const url = new URL(window.location.href);
+            const encodedCellStates = url.searchParams.get('cells');
+            if (encodedCellStates) {
+                console.log('Applying cell states from URL...');
+                const cellStates = decodeCellStatesFromUrl(encodedCellStates);
+                
+                // Use requestAnimationFrame to ensure DOM is ready
+                requestAnimationFrame(() => {
+                    applyCellStatesFromUrl(cellStates);
+                    
+                    // Clear initialization flag after cell states are applied
+                    // (typeof guard: skipped when _isInitializing is not declared)
+                    if (typeof _isInitializing !== 'undefined') {
+                        _isInitializing = false;
+                    }
+                });
+            } else {
+                // Clear initialization flag even if no cell states
+                // (still deferred to the next animation frame, as in the branch above)
+                if (typeof _isInitializing !== 'undefined') {
+                    requestAnimationFrame(() => {
+                        _isInitializing = false;
+                    });
+                }
+            }
+
+            // Bind drag selection on line numbers
+            // (handlers are attached to the whole document, not to individual lines)
+            document.addEventListener('mousedown', onLineNumberMouseDown);
+            document.addEventListener('mousemove', onDocMouseMove);
+            document.addEventListener('mouseup', onDocMouseUp);
+
+            // Add ESC key handler to exit tools
+            document.addEventListener('keydown', function(e) {
+                // e.keyCode === 27 is the legacy equivalent of e.key === 'Escape'.
+                if (e.key === 'Escape' || e.keyCode === 27) {
+                    // The active tool name is read from <body data-tool="...">.
+                    const currentTool = document.body.dataset.tool;
+                    if (currentTool && currentTool !== 'none') {
+                        // Deactivate the current tool
+                        window.setActiveTool('none');
+                    }
+                    // Also clear any active line selection
+                    clearSelection(true);
+                }
+            });
+        });
+
+        function updateStateIndicator() {
+            try {
+                const el = document.getElementById('status-widget');
+                if (!el) return;
+                const tool = document.body.dataset.tool || 'none';
+                if (tool && tool !== 'none') {
+                    el.textContent = `tool: ${tool} — Esc`;
+                    return;
+                }
+                if (_selection) {
+                    const t = _selection.a === _selection.b ? `L${_selection.a}` : `L${_selection.a}-${_selection.b}`;
+                    el.textContent = `selected: ${t} — Esc`;
+                    return;
+                }
+                el.textContent = 'ready — Esc';
+            } catch (_) {}
+        }
+    </script>
+</head>
+
+
+<body>
+    <div class="controls">
+        <div class="controls-buttons">
+            
+            <a href="index.html" class="back-button">← back</a>
+            
+            <div class="theme-toggle" onclick="toggleTheme()">light</div>
+            <div class="reset-toggle" onclick="resetLayout()">reset</div>
+            <div class="menu-button" onclick="toggleMenu()">
+                menu ▼
+                <div class="menu-dropdown">
+                    <div class="menu-item" onclick="setUiTheme('default')">
+                        <span class="menu-checkbox" id="checkbox-ui-default">☑</span> Theme: default
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('none')">
+                        <span class="menu-checkbox" id="checkbox-ui-none">☐</span> Theme: none
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('monocolor')">
+                        <span class="menu-checkbox" id="checkbox-ui-monocolor">☐</span> Theme: monocolor
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('tools')">
+                        <span class="menu-checkbox" id="checkbox-tools">☐</span> Tools
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('file-explorer')">
+                        <span class="menu-checkbox" id="checkbox-file-explorer">☐</span> File Explorer
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('minimap')">
+                        <span class="menu-checkbox" id="checkbox-minimap">☐</span> Table of Contents
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('status')">
+                        <span class="menu-checkbox" id="checkbox-status">☑</span> Status Indicator
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    
+    <div class="system-info">
+        <div class="system-info-header">Generated on:</div>
+        <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+        </div>
+    </div>
+    
+    <div class="main-content">
+        <h1>HF Kernels - Deformable DETR</h1>
+<h2>GPU Info</h2>
+<div class="cell" id="cell-nv">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span> 
+<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+</span> | 
+Cell: nv | 0.23s
+ | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
+<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/deformable_detr/impls/hf_kernels_deformable_detr.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/deformable-detr" target="_blank" class="hf-btn">🤗 HF</a>
+</div>
+<div id="code-nv" class="cell-code" data-lines="2">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">&quot;nvidia-smi&quot;</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-nv"></div>
+</div>
+</div>
+<div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:13:34 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   43C    P0             83W /  350W |       0MiB /  46068MiB |     60%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+</pre></div>
+</div>
+</div>
+
+<h2>Deformable DETR Multi-Scale Deformable Attention Benchmark</h2>
+<div class="cell" id="cell-benchmark">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
+<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+</span> | 
+Cell: benchmark | 8.30s
+ | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
+<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/deformable_detr/impls/hf_kernels_deformable_detr.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/deformable-detr" target="_blank" class="hf-btn">🤗 HF</a>
+</div>
+<div id="code-benchmark" class="cell-code" data-lines="42">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
+<span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
+<span class="c1"># dependencies = [</span>
+<span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
+<span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
+<span class="c1">#     &quot;kernels&quot;,</span>
+<span class="c1"># ]</span>
+<span class="c1">#</span>
+<span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
+<span class="c1"># ///</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="kn">import</span> <span class="n">KernelTypeEnum</span><span class="p">,</span> <span class="n">run_benchmark</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
+
+<span class="c1"># Load the deformable DETR kernel</span>
+<span class="n">deformable_detr</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">&quot;kernels-community/deformable-detr&quot;</span><span class="p">)</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">hf_kernels_deformable_detr</span><span class="p">(</span>
+    <span class="n">value</span><span class="p">,</span> <span class="n">spatial_shapes</span><span class="p">,</span> <span class="n">level_start_index</span><span class="p">,</span> <span class="n">sampling_locations</span><span class="p">,</span> <span class="n">attention_weights</span><span class="p">,</span> <span class="n">im2col_step</span><span class="o">=</span><span class="mi">64</span>
+<span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention&quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="n">deformable_detr</span><span class="o">.</span><span class="n">ms_deform_attn_forward</span><span class="p">(</span>
+        <span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span>
+        <span class="n">spatial_shapes</span><span class="o">=</span><span class="n">spatial_shapes</span><span class="p">,</span>
+        <span class="n">level_start_index</span><span class="o">=</span><span class="n">level_start_index</span><span class="p">,</span>
+        <span class="n">sampling_loc</span><span class="o">=</span><span class="n">sampling_locations</span><span class="p">,</span>
+        <span class="n">attn_weight</span><span class="o">=</span><span class="n">attention_weights</span><span class="p">,</span>
+        <span class="n">im2col_step</span><span class="o">=</span><span class="n">im2col_step</span>
+    <span class="p">)</span>
+
+
+<span class="n">run_benchmark</span><span class="p">(</span>
+    <span class="n">kernel_type</span><span class="o">=</span><span class="n">KernelTypeEnum</span><span class="o">.</span><span class="n">DEFORMABLE_DETR</span><span class="p">,</span>
+    <span class="n">impl_name</span><span class="o">=</span><span class="s2">&quot;hf_kernels_deformable_detr&quot;</span><span class="p">,</span>
+    <span class="n">impl_tags</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;family&quot;</span><span class="p">:</span> <span class="s2">&quot;hf-kernels&quot;</span><span class="p">,</span> <span class="s2">&quot;backend&quot;</span><span class="p">:</span> <span class="s2">&quot;cuda&quot;</span><span class="p">},</span>
+    <span class="n">impl_func</span><span class="o">=</span><span class="n">hf_kernels_deformable_detr</span><span class="p">,</span>
+    <span class="n">dtype</span><span class="o">=</span><span class="s2">&quot;float32&quot;</span><span class="p">,</span>
+<span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-benchmark"></div>
+</div>
+</div>
+<div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Running deformable_detr benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     195.201us       770.15%     195.201us     195.201us             1  
+                             hf_kernels_deformable_detr         7.43%     141.524us        99.61%       1.898ms       1.898ms       0.000us         0.00%      26.403us      26.403us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         3.93%      74.960us        92.19%       1.756ms     585.455us      22.464us        88.63%      26.403us       8.801us             3  
+void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      22.464us        88.63%      22.464us       7.488us             3  
+                                            aten::zeros         1.20%      22.800us        85.08%       1.621ms     540.337us       0.000us         0.00%       3.939us       1.313us             3  
+                                            aten::zero_         0.89%      16.910us        82.13%       1.565ms     521.590us       0.000us         0.00%       3.939us       1.313us             3  
+                                            aten::fill_         1.72%      32.820us        81.24%       1.548ms     515.953us       2.882us        11.37%       3.939us       1.313us             3  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.882us        11.37%       2.882us       0.961us             3  
+                                Activity Buffer Request        77.24%       1.472ms        77.24%       1.472ms       1.472ms       1.057us         4.17%       1.057us       1.057us             1  
+                                            aten::empty         1.76%      33.441us         1.76%      33.441us      11.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.19%      60.842us         3.19%      60.842us      10.140us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.89%      16.922us         0.89%      16.922us       2.820us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         1.13%      21.591us         1.37%      26.081us       8.694us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.24%       4.490us         0.24%       4.490us       1.497us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.39%       7.340us         0.39%       7.340us       7.340us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.905ms
+Self CUDA time total: 25.346us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     144.191us       546.22%     144.191us     144.191us             1  
+                             hf_kernels_deformable_detr         4.39%      75.912us        99.67%       1.722ms       1.722ms       0.000us         0.00%      27.358us      27.358us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         2.01%      34.700us        95.28%       1.646ms     548.647us      23.550us        89.21%      27.358us       9.119us             3  
+void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      23.550us        89.21%      23.550us       7.850us             3  
+                                            aten::zeros         0.49%       8.451us        91.07%       1.573ms     524.424us       0.000us         0.00%       3.808us       1.269us             3  
+                                            aten::zero_         0.50%       8.669us        89.54%       1.547ms     515.616us       0.000us         0.00%       3.808us       1.269us             3  
+                                            aten::fill_         1.60%      27.701us        89.04%       1.538ms     512.727us       2.848us        10.79%       3.808us       1.269us             3  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.848us        10.79%       2.848us       0.949us             3  
+                                Activity Buffer Request        85.90%       1.484ms        85.90%       1.484ms       1.484ms       0.960us         3.64%       0.960us       0.960us             1  
+                                            aten::empty         1.04%      17.971us         1.04%      17.971us       5.990us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.40%      41.442us         2.40%      41.442us       6.907us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.54%       9.400us         0.54%       9.400us       1.567us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.66%      11.329us         0.79%      13.720us       4.573us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.14%       2.391us         0.14%       2.391us       0.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.680us         0.33%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.728ms
+Self CUDA time total: 26.398us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     140.288us       549.37%     140.288us     140.288us             1  
+                             hf_kernels_deformable_detr         4.34%      74.492us        99.67%       1.709ms       1.709ms       0.000us         0.00%      26.464us      26.464us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         1.96%      33.680us        95.32%       1.635ms     544.984us      22.752us        89.10%      26.464us       8.821us             3  
+void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      22.752us        89.10%      22.752us       7.584us             3  
+                                            aten::zeros         0.50%       8.650us        91.19%       1.564ms     521.367us       0.000us         0.00%       3.712us       1.237us             3  
+                                            aten::zero_         0.47%       8.130us        89.69%       1.538ms     512.773us       0.000us         0.00%       3.712us       1.237us             3  
+                                            aten::fill_         1.63%      27.881us        89.21%       1.530ms     510.063us       2.784us        10.90%       3.712us       1.237us             3  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.784us        10.90%       2.784us       0.928us             3  
+                                Activity Buffer Request        86.04%       1.476ms        86.04%       1.476ms       1.476ms       0.928us         3.63%       0.928us       0.928us             1  
+                                            aten::empty         1.00%      17.131us         1.00%      17.131us       5.710us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.42%      41.510us         2.42%      41.510us       6.918us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.52%       8.991us         0.52%       8.991us       1.498us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.62%      10.681us         0.77%      13.291us       4.430us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.15%       2.610us         0.15%       2.610us       0.870us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.730us         0.33%       5.730us       5.730us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.715ms
+Self CUDA time total: 25.536us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                             hf_kernels_deformable_detr         0.00%       0.000us         0.00%       0.000us       0.000us     151.934us       322.76%     151.934us     151.934us             1  
+                             hf_kernels_deformable_detr         3.86%      74.313us        99.75%       1.919ms       1.919ms       0.000us         0.00%      48.129us      48.129us             1  
+       _deformable_detr_57c3d32::ms_deform_attn_forward         1.79%      34.420us        95.88%       1.844ms     614.769us      43.968us        93.40%      48.129us      16.043us             3  
+void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo...         0.00%       0.000us         0.00%       0.000us       0.000us      43.968us        93.40%      43.968us      14.656us             3  
+                                            aten::zeros         0.45%       8.600us        92.03%       1.770ms     590.092us       0.000us         0.00%       4.161us       1.387us             3  
+                                            aten::zero_         0.45%       8.690us        90.72%       1.745ms     581.642us       0.000us         0.00%       4.161us       1.387us             3  
+                                            aten::fill_         1.44%      27.641us        90.26%       1.736ms     578.745us       3.105us         6.60%       4.161us       1.387us             3  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.105us         6.60%       3.105us       1.035us             3  
+                                Activity Buffer Request        76.84%       1.478ms        76.84%       1.478ms       1.478ms       1.056us         2.24%       1.056us       1.056us             1  
+                                            aten::empty         0.87%      16.750us         0.87%      16.750us       5.583us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        12.74%     245.037us        12.74%     245.037us      40.839us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.49%       9.420us         0.49%       9.420us       1.570us       0.000us         0.00%       0.000us       0.000us             6  
+                                           aten::select         0.66%      12.781us         0.82%      15.781us       5.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                       aten::as_strided         0.16%       3.000us         0.16%       3.000us       1.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.25%       4.890us         0.25%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.924ms
+Self CUDA time total: 47.073us
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4     0.04  True
+hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4     0.05  True
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 52 packages in 237ms
+</div>
+</div>
+<div class="cell-stderr">Fetching 7 files:   0%|          | 0/7 [00:00&lt;?, ?it/s]
+Fetching 7 files:  14%|█▍        | 1/7 [00:00&lt;00:00,  6.20it/s]
+Fetching 7 files:  71%|███████▏  | 5/7 [00:00&lt;00:00,  9.26it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 12.59it/s]</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
+</div>
+</div>
+</div>
+    </div>
+    
+</body>
+</html>
\ No newline at end of file
diff --git a/deformable_detr/impls/index.html b/deformable_detr/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..507f4753d9c1efbdcbe259d5a8105e4524b0527f
--- /dev/null
+++ b/deformable_detr/impls/index.html
@@ -0,0 +1,89 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset='UTF-8'>
+  <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+  <title>Index of /deformable_detr/impls</title>
+  <style>
+    :root {
+      --bg-primary: #0a0a0a;
+      --bg-secondary: #121212;
+      --bg-tertiary: #181818;
+      --text-primary: #e0e0e0;
+      --text-secondary: #888888;
+      --text-link: #64b5f6;
+      --border-primary: #2a2a2a;
+    }
+    body {
+      font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+      background: var(--bg-primary);
+      color: var(--text-primary);
+      margin: 0;
+      padding: 16px;
+      max-width: 900px;
+      margin: 0 auto;
+    }
+    .controls {
+      display: flex;
+      justify-content: flex-end;
+      margin-bottom: 1rem;
+    }
+    .back-button {
+      background: var(--bg-secondary);
+      border: 1px solid var(--border-primary);
+      padding: 8px 12px;
+      border-radius: 4px;
+      color: var(--text-secondary);
+      cursor: pointer;
+      font-size: 0.9rem;
+      text-decoration: none;
+      display: inline-block;
+    }
+    .back-button:hover {
+      color: var(--text-primary);
+      background: var(--bg-tertiary);
+    }
+    h1 {
+      font-size: 1.5em;
+      margin: 1rem 0;
+      color: var(--text-primary);
+      border-bottom: 1px solid var(--border-primary);
+      padding-bottom: 0.5rem;
+    }
+    ul {
+      list-style-type: none;
+      padding: 0;
+    }
+    li {
+      margin: 0;
+      border-bottom: 1px solid var(--border-primary);
+    }
+    li:last-child {
+      border-bottom: none;
+    }
+    a {
+      display: block;
+      padding: 0.75rem 0.5rem;
+      text-decoration: none;
+      color: var(--text-link);
+      transition: background 0.2s ease;
+    }
+    a:hover {
+      background: var(--bg-secondary);
+    }
+    .dir {
+      font-weight: 500;
+    }
+  </style>
+</head>
+<body>
+  <div class='controls'>
+    <a href='../index.html' class='back-button'>← back</a>
+  </div>
+  <h1>Index of /deformable_detr/impls</h1>
+  <ul>
+    <li><a href='hf_kernels_deformable_detr.html' class='file'>hf_kernels_deformable_detr.html</a></li>
+    <li><a href='torch_deformable_detr.html' class='file'>torch_deformable_detr.html</a></li>
+  </ul>
+</body>
+</html>
\ No newline at end of file
diff --git a/deformable_detr/impls/torch_deformable_detr.html b/deformable_detr/impls/torch_deformable_detr.html
new file mode 100644
index 0000000000000000000000000000000000000000..1d330b066f83130623802310ab8c5a5ceec69b71
--- /dev/null
+++ b/deformable_detr/impls/torch_deformable_detr.html
@@ -0,0 +1,4434 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>torch_deformable_detr</title>
+
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&display=swap" rel="stylesheet">
+
+    <script>
+// Iframe-friendly navigation router: when embedded in a frame, link clicks are mirrored into location.hash as '#/<path>'
+        (function() {
+            const isIframe = window.self !== window.top;
+            if (!isIframe) return; // No-op when this page is the top-level window
+
+            // On load: a '#/<target>' hash that differs from the current file name triggers a navigation to <target>
+            const hash = window.location.hash;
+            if (hash && hash.startsWith('#/')) {
+                const targetPath = hash.slice(2); // Drop the leading '#/'
+                const currentPath = window.location.pathname.split('/').pop(); // last path segment only
+
+                // NOTE(review): the click handler below writes a full path into the hash, while currentPath is a bare file name; confirm this comparison is intended
+                if (targetPath !== currentPath) {
+                    window.location.href = targetPath; // relative URL, resolved against the current document
+                    return; // Stop here; the document is being replaced
+                }
+            }
+
+            // Intercept clicks on site-relative links and route them through the hash
+            document.addEventListener('click', function(e) {
+                const link = e.target.closest('a');
+                if (!link) return;
+
+                const href = link.getAttribute('href');
+
+                // Leave links without href, in-page anchors, http(s) URLs and javascript: links to the browser
+                if (!href || href.startsWith('#') || href.startsWith('http') || href.startsWith('javascript:')) {
+                    return;
+                }
+
+                e.preventDefault();
+
+                // Resolve the href against the current document to obtain its full pathname
+                const url = new URL(href, window.location.href);
+                let fullPath = url.pathname;
+
+                // Strip the leading slash so the hash reads '#/dir/file.html'
+                if (fullPath.startsWith('/')) {
+                    fullPath = fullPath.slice(1);
+                }
+
+                // Record the target in this frame's own location.hash (presumably observed by the embedding page; TODO confirm)
+                window.location.hash = '#/' + fullPath;
+
+                // Paths ending in '.html' or '/' are loaded inside the iframe
+                if (fullPath.endsWith('.html') || fullPath.endsWith('/')) {
+                    const pathParts = fullPath.split('/').filter(p => p);
+                    const targetFile = pathParts[pathParts.length - 1] || 'index.html'; // NOTE(review): keeps only the last segment, so '../x.html' loads './x.html'; confirm
+                    window.location.href = targetFile;
+                } else {
+                    // Anything else (raw .py, data files, etc.) is opened in a new browsing context
+                    window.open(href, '_blank');
+                }
+            });
+        })();
+
+        // Set data-theme, data-ui and data-widgets on <html> right away so the attribute-scoped CSS applies without flicker
+        (function() {
+            const configTheme = 'dark'; // page default; the value 'auto' would follow the OS color scheme instead
+            const hasConfigUi = false;
+            const configUi = hasConfigUi ? 'None' : null; // NOTE(review): 'None' (capital N) is not an accepted value below and would be normalized to 'default'
+            const hasWidgetsConfig = false;
+            const widgetsOn = hasWidgetsConfig ? false : true; // widgets are on when no widgets config is present
+            let theme;
+            if (configTheme === 'auto') { // this branch does not consult the saved 'uvnote-theme' value
+                theme = window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
+            } else {
+                theme = localStorage.getItem('uvnote-theme') || configTheme; // a saved choice wins over the page default
+            }
+            document.documentElement.setAttribute('data-theme', theme);
+
+            // UI variant: an explicit page config wins, otherwise the saved 'uvnote-ui' value, otherwise 'default'
+            let ui = hasConfigUi ? configUi : (localStorage.getItem('uvnote-ui') || 'default');
+            if (ui !== 'default' && ui !== 'none' && ui !== 'monocolor') { ui = 'default'; } // unknown values fall back to 'default'
+            document.documentElement.setAttribute('data-ui', ui);
+
+            // Expose widget visibility to CSS through the data-widgets attribute
+            document.documentElement.setAttribute('data-widgets', widgetsOn ? 'on' : 'off');
+        })();
+    </script>
+    <style>
+:root[data-theme="light"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: #f6f8fa;
+    --bg-tertiary: #f8f9fa;
+    --bg-code: #f8f9fa;
+    --bg-error: #fdf2f2;
+    --bg-artifact: #e6f3ff;
+    --bg-artifact-hover: #d0e7ff;
+
+    --text-primary: #333;
+    --text-secondary: #656d76;
+    --text-error: #c53030;
+    --text-link: #0969da;
+
+    --border-primary: #e1e5e9;
+    --border-error: #e53e3e;
+    --border-cell-failed: #d73a49;
+
+    --shadow: rgba(0, 0, 0, 0.1);
+}
+
+:root[data-theme="dark"] {
+    --bg-primary: #0a0a0a;
+    --bg-secondary: #121212;
+    --bg-tertiary: #181818;
+    --bg-code: #0d0d0d;
+    --bg-error: #1a0f0f;
+    --bg-artifact: #151515;
+    --bg-artifact-hover: #1a1a1a;
+
+    --text-primary: #e0e0e0;
+    --text-secondary: #888888;
+    --text-error: #ff6b6b;
+    --text-link: #64b5f6;
+
+    --border-primary: #2a2a2a;
+    --border-error: #ff6b6b;
+    --border-cell-failed: #ff6b6b;
+
+    --shadow: rgba(255, 255, 255, 0.05);
+}
+
+/* Monocolor UI theme: black/white background, all text/borders single blue */
+:root[data-ui="monocolor"] {
+    --mono-color: #0a66ff;
+}
+
+:root[data-ui="monocolor"][data-theme="light"] {
+    --bg-primary: #ffffff;
+}
+
+:root[data-ui="monocolor"][data-theme="dark"] {
+    --bg-primary: #000000;
+}
+
+:root[data-ui="monocolor"] {
+    --bg-secondary: var(--bg-primary);
+    --bg-tertiary: var(--bg-primary);
+    --bg-code: var(--bg-primary);
+    --bg-error: var(--bg-primary);
+    --bg-artifact: var(--bg-primary);
+    --bg-artifact-hover: var(--bg-primary);
+
+    --text-primary: var(--mono-color);
+    --text-secondary: var(--mono-color);
+    --text-error: var(--mono-color);
+    --text-link: var(--mono-color);
+
+    --border-primary: var(--mono-color);
+    --border-error: var(--mono-color);
+    --border-cell-failed: var(--mono-color);
+
+    --shadow: none;
+}
+
+:root[data-ui="monocolor"] a {
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button,
+:root[data-ui="monocolor"] .theme-toggle,
+:root[data-ui="monocolor"] .reset-toggle,
+:root[data-ui="monocolor"] .back-button {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button:hover,
+:root[data-ui="monocolor"] .theme-toggle:hover,
+:root[data-ui="monocolor"] .reset-toggle:hover,
+:root[data-ui="monocolor"] .back-button:hover {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-dropdown {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    box-shadow: none;
+}
+
+:root[data-ui="monocolor"] .menu-item {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .system-info {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell {
+    border-color: var(--mono-color);
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .cell-header {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact:hover {
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .artifact-preview img,
+:root[data-ui="monocolor"] .artifact-preview svg {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .status-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .minimap,
+:root[data-ui="monocolor"] .file-explorer,
+:root[data-ui="monocolor"] .tools-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell-code {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tools-title,
+:root[data-ui="monocolor"] .file-explorer-section-title,
+:root[data-ui="monocolor"] .minimap-title {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button.active {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .file-explorer-item,
+:root[data-ui="monocolor"] .minimap-item {
+    color: var(--mono-color);
+}
+
+/* Force Pygments code to mono blue on mono bg */
+:root[data-ui="monocolor"] .highlight {
+    background: var(--bg-primary) !important;
+    color: var(--mono-color) !important;
+}
+
+:root[data-ui="monocolor"] .highlight *,
+:root[data-ui="monocolor"] .highlight .hll {
+    color: var(--mono-color) !important;
+    background: transparent !important;
+    border-color: var(--mono-color) !important;
+}
+
+/* Default code font + metrics (overridable via frontmatter) */
+:root {
+    --code-font-size: 0.95rem;
+    --code-line-height: 1.5;
+    --code-pad-y: 0.75rem;
+}
+
+/* Minimal UI theme overrides base variables for a flatter, 90s look */
+:root[data-ui="none"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: transparent;
+    --bg-tertiary: transparent;
+    --bg-code: #f9f9f9;
+    --bg-error: #fff0f0;
+    --bg-artifact: #f0f7ff;
+    --bg-artifact-hover: #e5f1ff;
+
+    --text-primary: #000000;
+    --text-secondary: #222222;
+    --text-error: #a00000;
+    --text-link: #0000ee;
+
+    --border-primary: #cccccc;
+    --border-error: #cc0000;
+    --border-cell-failed: #cc0000;
+
+    --shadow: none;
+}
+
+html {
+    overscroll-behavior: none;
+}
+
+body {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    line-height: 1.4;
+    max-width: 1000px;
+    margin: 0 auto;
+    padding: 15px;
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    transition: background-color 0.2s ease, color 0.2s ease;
+    overscroll-behavior: none;
+}
+
+/* Minimal "none" UI theme overrides */
+:root[data-ui="none"] body {
+    font-family: 'Times New Roman', Times, serif;
+    line-height: 1.5;
+    max-width: 860px;
+    padding: 12px;
+    background: #ffffff;
+    color: #000000;
+    transition: none;
+}
+
+/* Two panel layout removed */
+
+.controls {
+    position: fixed;
+    top: 20px;
+    right: 20px;
+    display: flex;
+    flex-direction: column;
+    align-items: flex-end;
+    gap: 0.25rem;
+    z-index: 1000;
+}
+
+.controls-buttons {
+    display: flex;
+    gap: 0.5rem;
+}
+
+.menu-button {
+    position: relative;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+/* Keep default control styling when widgets are enabled, even in minimal UI */
+:root[data-ui="none"][data-widgets="on"] .menu-button,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle,
+:root[data-ui="none"][data-widgets="on"] .back-button {
+    background: #f6f6f6;
+    border: 1px solid #cccccc;
+    color: #222222;
+}
+
+.menu-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+/* Controls state indicator (top-right) */
+/* Status widget (bottom-right) */
+.status-widget {
+    position: fixed;
+    right: 20px;
+    bottom: 20px;
+    width: auto;
+    max-width: 260px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 6px 8px;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    z-index: 100;
+}
+
+.status-widget strong {
+    color: var(--text-primary);
+}
+
+:root[data-ui="none"][data-widgets="on"] .status-widget {
+    background: #f6f6f6;
+    border-color: #ccc;
+    color: #222;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .back-button:hover {
+    background: #ededed;
+    border-color: #bbbbbb;
+    color: #000000;
+}
+
+.menu-dropdown {
+    position: absolute;
+    top: 100%;
+    right: 0;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    box-shadow: 0 4px 12px var(--shadow);
+    min-width: 160px;
+    opacity: 0;
+    visibility: hidden;
+    transform: translateY(-8px);
+    transition: all 0.2s ease;
+    z-index: 1001;
+    margin-top: 4px;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-dropdown {
+    background: #ffffff;
+    border: 1px solid #cccccc;
+    box-shadow: none;
+}
+
+.menu-button.active .menu-dropdown {
+    opacity: 1;
+    visibility: visible;
+    transform: translateY(0);
+}
+
+.menu-item {
+    display: block;
+    padding: 8px 12px;
+    color: var(--text-secondary);
+    text-decoration: none;
+    font-size: 0.85rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: pointer;
+}
+
+:root[data-ui="none"] .menu-item {
+    color: #000;
+    border-bottom: 1px solid #eee;
+}
+
+.menu-item:last-child {
+    border-bottom: none;
+}
+
+.menu-item:hover {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+}
+
+.menu-checkbox {
+    display: inline-block;
+    width: 16px;
+    font-family: monospace;
+    color: var(--text-link);
+}
+
+.theme-toggle,
+.reset-toggle,
+.back-button {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 4px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+.back-button {
+    text-decoration: none;
+    display: inline-block;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover,
+.back-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+.system-info {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    padding: 8px 12px;
+    margin-bottom: 16px;
+    font-size: 0.85em;
+    color: var(--text-secondary);
+}
+
+.system-info-header {
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 2px;
+}
+
+.system-info-content {
+    font-family: monospace;
+}
+
+.theme-toggle,
+.reset-toggle {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    /* padding: 0.4rem 0.6rem; */
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    user-select: none;
+    transition: all 0.2s ease;
+    text-transform: lowercase;
+    letter-spacing: 0;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover {
+    background: var(--bg-tertiary);
+    border-color: var(--text-secondary);
+    color: var(--text-primary);
+}
+
+.minimap {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Hide widgets and controls when disabled via frontmatter */
+:root[data-widgets="off"] .controls,
+:root[data-widgets="off"] .minimap,
+:root[data-widgets="off"] .file-explorer,
+:root[data-widgets="off"] .tools-widget,
+:root[data-widgets="off"] .status-widget {
+    display: none !important;
+}
+
+.file-explorer {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Drawing overlay */
+.draw-overlay {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100vw;
+    height: 100vh;
+    z-index: 80;
+    /* under widgets (100) and controls (1000) */
+    display: block;
+    pointer-events: none;
+    /* enabled only when a tool is active */
+}
+
+/* Tools widget */
+.tools-widget {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    z-index: 100;
+    opacity: 0.95;
+}
+
+.tools-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    user-select: none;
+}
+
+.tools-row {
+    display: flex;
+    gap: 0.4rem;
+    flex-wrap: wrap;
+}
+
+.tool-button {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.25rem 0.4rem;
+    cursor: pointer;
+    color: var(--text-secondary);
+    font-family: inherit;
+    font-size: 0.75rem;
+    user-select: none;
+}
+
+.tool-button:hover {
+    color: var(--text-primary);
+}
+
+.tool-button.active {
+    color: var(--text-primary);
+    border-color: var(--text-secondary);
+    background: var(--bg-secondary);
+}
+
+.minimap:hover,
+.file-explorer:hover {
+    opacity: 1;
+}
+
+.minimap-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.minimap-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.15rem 0;
+    border-left: 2px solid transparent;
+    padding-left: 0.5rem;
+    transition: all 0.2s ease;
+    cursor: pointer;
+}
+
+.minimap-item:hover {
+    color: var(--text-primary);
+    border-left-color: var(--text-secondary);
+}
+
+.minimap-item.active {
+    color: var(--text-primary);
+    border-left-color: var(--text-link);
+}
+
+.minimap-heading {
+    font-weight: normal;
+}
+
+.minimap-heading.h1 {
+    padding-left: 0.5rem;
+}
+
+.minimap-heading.h2 {
+    padding-left: 1rem;
+}
+
+.minimap-heading.h3 {
+    padding-left: 1.5rem;
+}
+
+.minimap-heading.h4 {
+    padding-left: 2rem;
+}
+
+.minimap-heading.h5 {
+    padding-left: 2.5rem;
+}
+
+.minimap-heading.h6 {
+    padding-left: 3rem;
+}
+
+.minimap-cell {
+    color: var(--text-link);
+    opacity: 0.8;
+    font-style: italic;
+}
+
+.minimap-cell:hover {
+    opacity: 1;
+}
+
+.file-explorer-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.file-explorer-section {
+    margin-bottom: 0.75rem;
+}
+
+.file-explorer-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin-bottom: 0.25rem;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.file-explorer-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.1rem 0;
+    margin-left: 0.5rem;
+    transition: color 0.2s ease;
+    cursor: pointer;
+    font-family: monospace;
+}
+
+.file-explorer-item:hover {
+    color: var(--text-primary);
+}
+
+.file-explorer-item.script {
+    color: var(--text-link);
+}
+
+.file-explorer-item.artifact {
+    color: var(--text-secondary);
+    opacity: 0.8;
+}
+
+
+/* Hide widgets on smaller screens */
+@media (max-width: 768px) {
+
+    .minimap,
+    .file-explorer,
+    .tools-widget {
+        display: none;
+    }
+}
+
+.cell {
+    margin: 1rem 0;
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    overflow: hidden;
+    background: var(--bg-secondary);
+}
+
+:root[data-ui="none"] .cell {
+    margin: 1em 0;
+    border: none;
+    background: transparent;
+}
+
+.cell-header {
+    background: var(--bg-secondary);
+    padding: 0.5rem 1rem;
+    border-bottom: 1px solid var(--border-primary);
+    font-family: inherit;
+    font-size: 0.85rem;
+}
+
+:root[data-ui="none"] .cell-header {
+    background: transparent;
+    border: none;
+    padding: 0;
+    font-weight: bold;
+}
+
+:root[data-ui="none"] .cell-content {
+    padding: 0;
+}
+
+:root[data-ui="none"] .copy-button,
+:root[data-ui="none"] .collapse-indicators,
+:root[data-ui="none"] .cell-meta,
+:root[data-ui="none"] .cell-outputs-header {
+    display: none !important;
+}
+
+:root[data-ui="none"] pre,
+:root[data-ui="none"] code {
+    font-family: Menlo, Monaco, 'Courier New', monospace;
+}
+
+:root[data-ui="none"] .code-content pre {
+    background: #f9f9f9;
+    border: 1px solid #ddd;
+    padding: 8px;
+}
+
+:root[data-ui="none"] .output {
+    background: transparent;
+    border: none;
+    padding: 0.25em 0;
+}
+
+.cell-header { /* header text color plus pointer / no-select behaviour */
+    color: var(--text-secondary);
+    cursor: pointer;
+    user-select: none;
+    transition: background-color 0.2s ease;
+}
+.cell-header:hover {
+    background: var(--bg-tertiary);
+}
+
+.collapse-indicators {
+    color: var(--text-secondary);
+    font-size: 0.8rem;
+    opacity: 0.7;
+}
+
+.collapse-indicators span:hover {
+    color: var(--text-primary);
+    opacity: 1;
+}
+
+.cell-code {
+    display: block;
+    background: var(--bg-code);
+}
+
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code pre {
+    margin: 0;
+    padding: 0.75rem;
+    background: var(--bg-code);
+    overflow-x: auto;
+    color: var(--text-primary);
+}
+
+.cell-output {
+    padding: 0.75rem;
+    /* background: var(--bg-primary); */
+    background: var(--bg-secondary);
+}
+
+.cell-output.collapsed {
+    display: none;
+}
+
+/* Captured stdout: wraps long text and scrolls rather than overflowing. */
+.cell-stdout {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+    padding: 0.75rem;
+    border-radius: 1px;
+    font-family: inherit;
+    font-size: 0.9rem;
+    white-space: pre-wrap;
+    /* show scrollbars when needed */
+    overflow: auto;
+    /* respect whatever width the layout provides */
+    max-width: 100%;
+}
+
+.cell-stdout .stdout-text {
+    margin: 0;
+    /* reset pre default margin */
+    white-space: pre;
+    /* keep line breaks, NO wrapping */
+    display: inline-block;
+    /* shrink-to-content */
+    min-width: max-content;
+    /* allow very long lines to define intrinsic width */
+    font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+    tab-size: 2;
+}
+
+.cell-stderr {
+    background: var(--bg-error);
+    border-left: 2px solid var(--border-error);
+    padding: 1rem;
+    margin: 0.5rem 0;
+    font-family: inherit;
+    font-size: 0.9rem;
+    color: var(--text-error);
+    white-space: pre-wrap;
+}
+
+.uv-install-logs {
+    margin: 0.5rem 0;
+}
+
+.uv-logs-header {
+    cursor: pointer;
+    padding: 0.75rem;
+    border-left: 3px solid var(--border-color);
+    font-family: inherit;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    user-select: none;
+}
+
+.uv-logs-content {
+    background: var(--bg-secondary);
+    padding: 1rem;
+    border-left: 3px solid var(--border-color);
+    white-space: pre-wrap;
+    font-family: monospace;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    overflow-x: auto;
+}
+
+.cell-artifacts {
+    margin: 1rem 0;
+}
+
+.cell-artifacts h4 {
+    margin: 0 0 0.5rem 0;
+    color: var(--text-secondary);
+    font-size: 0.9rem;
+}
+
+.artifact {
+    display: inline-block;
+    background: var(--bg-artifact);
+    padding: 0.25rem 0.5rem;
+    border-radius: 1px;
+    margin: 0.25rem 0.5rem 0.25rem 0;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-link);
+    text-decoration: none;
+    transition: background-color 0.2s ease;
+    border: 1px solid var(--border-primary);
+}
+
+.artifact:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-preview {
+    margin-top: 1rem;
+}
+
+.artifact-preview img {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.artifact-preview svg {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+    display: block;
+}
+
+/* Style SVG text elements */
+.artifact-preview svg g {
+    fill: var(--text-primary) !important;
+}
+
+/* Auto-theme SVG elements */
+.artifact-preview svg {
+    background: transparent;
+}
+
+/* Invert SVG images in dark mode */
+:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
+    filter: invert(0.9) hue-rotate(180deg);
+}
+
+/* Keep SVG images readable in monocolor mode */
+:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
+    filter: none;
+}
+
+/* CSV table styling */
+.artifact-csv {
+    margin-top: 1rem;
+    overflow-x: auto;
+}
+
+.csv-table {
+    width: 100%;
+    border-collapse: collapse;
+    font-size: 0.9rem;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.csv-table th,
+.csv-table td {
+    padding: 0.5rem 0.75rem;
+    text-align: left;
+    border: 1px solid var(--border-primary);
+}
+
+.csv-table th {
+    background: var(--bg-tertiary);
+    font-weight: 600;
+    color: var(--text-primary);
+}
+
+.csv-table tbody tr:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-csv-error {
+    margin-top: 1rem;
+    padding: 1rem;
+    background: var(--bg-error);
+    color: var(--text-error);
+    border: 1px solid var(--border-error);
+    border-radius: 1px;
+}
+
+.cell-failed {
+    border-color: var(--border-cell-failed);
+}
+
+.cell-failed .cell-header {
+    background: var(--bg-error);
+    color: var(--text-error);
+}
+
+.cell-commented {
+    opacity: 0.6;
+    border-style: dashed;
+}
+
+.cell-commented .cell-header {
+    background: var(--bg-secondary);
+    color: var(--text-secondary);
+    font-style: italic;
+}
+
+/* Small header buttons (run / copy / raw / GitHub / HF) share one look. */
+.run-btn,
+.copy-btn,
+.raw-btn,
+.github-btn,
+.hf-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+}
+
+/* Extra declarations for the three link-style buttons. */
+.raw-btn,
+.github-btn,
+.hf-btn {
+    text-decoration: none;
+    display: inline-block;
+}
+
+.run-btn:hover,
+.copy-btn:hover,
+.raw-btn:hover,
+.github-btn:hover,
+.hf-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.raw-btn:hover,
+.github-btn:hover,
+.hf-btn:hover {
+    text-decoration: none;
+}
+
+.run-btn:disabled,
+.copy-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+}
+
+/* Kept after the :hover rule: same specificity, so source order decides. */
+.copy-btn.copied {
+    color: #4caf50;
+    background: var(--bg-primary);
+    border-color: #4caf50;
+    transition: all 0.2s ease;
+}
+
+.output-stale {
+    opacity: 0.5;
+    position: relative;
+}
+
+.output-stale::after {
+    content: '⏳ updating...';
+    position: absolute;
+    top: 8px;
+    right: 8px;
+    background: var(--bg-secondary);
+    padding: 4px 8px;
+    border-radius: 2px;
+    font-size: 0.75em;
+    color: var(--text-secondary);
+    border: 1px solid var(--border-primary);
+}
+
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+    margin-top: 1.5rem;
+    margin-bottom: 0.75rem;
+    color: var(--text-primary);
+}
+
+h1 {
+    margin-top: 0;
+    margin-bottom: 1rem;
+}
+
+p {
+    margin: 0.75rem 0;
+    color: var(--text-primary);
+}
+
+a {
+    color: var(--text-link);
+}
+
+img {
+    max-width: 100%;
+    height: auto;
+    border-radius: 1px;
+    box-shadow: none;
+}
+
+pre,
+code {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+}
+
+.code-wrap {
+    position: relative;
+}
+
+.code-line-highlight {
+    display: none;
+    position: absolute;
+    left: 0;
+    right: 0;
+    height: 1.5em;
+    background: rgba(255, 235, 170, 0.35);
+    pointer-events: none;
+    border-left: 3px solid #f4c542;
+}
+
+.line-number {
+    cursor: pointer;
+    text-decoration: none;
+    color: var(--text-secondary);
+    padding: 0 0.25rem;
+}
+
+.line-number.selected {
+    background: rgba(255, 235, 170, 0.4);
+    color: var(--text-primary);
+}
+
+/* Line numbers */
+.highlight-with-lines {
+    display: flex;
+}
+
+.line-numbers {
+    background: var(--bg-tertiary);
+    padding: var(--code-pad-y) 0.5rem;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+    line-height: var(--code-line-height);
+    color: var(--text-secondary);
+    user-select: none;
+    text-align: right;
+    border-right: 1px solid var(--border-primary);
+}
+
+.line-numbers .line-number {
+    display: block;
+    line-height: var(--code-line-height);
+}
+
+.highlight-with-lines .highlight {
+    flex: 1;
+}
+
+.highlight .hll {
+    background-color: transparent;
+}
+
+/* don't conflict with our highlight */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem;
+    line-height: var(--code-line-height);
+}
+
+/* Collapsed code styling */
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code.expanded {
+    display: block;
+}
+
+/* Default code visibility: shown, with a separator under the code.
+   This spot used to hold un-rendered template tags (an if/else on
+   `config.collapse_code`) that reached the page as invalid CSS, plus a
+   `.cell-code { display: none; }` rule that the rule below always overrode.
+   Only the rule that actually took effect is kept. */
+.cell-code {
+    display: block;
+    border-bottom: 1px solid var(--border-primary);
+}
+
+/* NOTE(review): a `pygments_css` placeholder was also emitted here without
+   being substituted, so no Pygments token colours are defined at this point.
+   Fix the placeholder in the page generator's template. */
+
+/* Ensure our code metrics override Pygments defaults */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem !important;
+    line-height: var(--code-line-height) !important;
+    font-size: var(--code-font-size) !important;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important;
+    border: none;
+}
+
+.line-numbers {
+    line-height: var(--code-line-height) !important;
+}
+
+.line-numbers .line-number {
+    line-height: var(--code-line-height) !important;
+}
+
+/* Custom CSS from frontmatter: nothing was injected here.
+   The generator left un-rendered placeholders in this spot (`config.custom_css`
+   and an optional `--code-font-size` override). The two leftover `:root` rules
+   assigned a brace block to `--code-font-size`, which makes every
+   `font-size: var(--code-font-size)` invalid at computed-value time, so they
+   have been removed.
+   NOTE(review): this assumes `--code-font-size` is defined earlier in the
+   stylesheet - confirm, and repair the placeholders in the generator template. */
+
+/* Cursor for tools */
+body[data-tool="arrow"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><path d="M2 2l7.586 7.586"/><circle cx="11" cy="11" r="2"/></svg>') 12 12, crosshair;
+}
+
+body[data-tool="pen"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><circle cx="4" cy="20" r="2" fill="%23e53935"/></svg>') 4 20, pointer;
+}
+
+body[data-tool="eraser"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M20 20H7l-7-7 7-7h13v14z"/><path d="M13 13l7-7"/><path d="M13 13L9 9"/></svg>') 12 12, auto;
+}
+
+/* Color picker styles */
+.tools-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin: 0.75rem 0 0.5rem 0;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.color-row {
+    display: grid;
+    grid-template-columns: repeat(6, 1fr);
+    gap: 0.25rem;
+    margin-bottom: 0.5rem;
+}
+
+.color-swatch {
+    width: 18px;
+    height: 18px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    position: relative;
+}
+
+.color-swatch:hover {
+    transform: scale(1.1);
+    border-color: var(--text-secondary);
+}
+
+.color-swatch.selected {
+    border-color: var(--text-primary);
+    box-shadow: 0 0 0 2px var(--text-link);
+}
+
+.color-swatch.selected::after {
+    content: '✓';
+    position: absolute;
+    top: 50%;
+    left: 50%;
+    transform: translate(-50%, -50%);
+    color: white;
+    font-size: 10px;
+    font-weight: bold;
+    text-shadow: 1px 1px 1px black;
+}
+
+.color-input {
+    width: 24px;
+    height: 24px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    background: none;
+    padding: 0;
+    grid-column: span 2;
+    justify-self: center;
+}
+
+.color-input:hover {
+    border-color: var(--text-secondary);
+}
+
+/* Thickness slider styles */
+.thickness-row {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    margin-top: 0.75rem;
+}
+
+.thickness-slider {
+    flex: 1;
+    -webkit-appearance: none;
+    appearance: none;
+    height: 4px;
+    background: var(--border-primary);
+    border-radius: 2px;
+    outline: none;
+    opacity: 0.7;
+    transition: opacity 0.2s;
+}
+
+.thickness-slider:hover {
+    opacity: 1;
+}
+
+.thickness-slider::-webkit-slider-thumb {
+    -webkit-appearance: none;
+    appearance: none;
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+}
+
+.thickness-slider::-moz-range-thumb {
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+    border: none;
+}
+
+.thickness-value {
+    font-size: 0.7rem;
+    color: var(--text-secondary);
+    min-width: 20px;
+    text-align: right;
+}
+
+.highlight {
+    background: none !important;
+}
+
+/* Loading animations */
+.loading-spinner {
+    display: inline-block;
+    width: 16px;
+    height: 16px;
+    border: 2px solid var(--border-primary);
+    border-radius: 50%;
+    border-top-color: var(--text-link);
+    animation: spin 1s linear infinite;
+    margin-right: 8px;
+    vertical-align: middle;
+}
+
+@keyframes spin {
+    to {
+        transform: rotate(360deg);
+    }
+}
+
+.loading-skeleton {
+    display: inline-block;
+    background: var(--bg-tertiary);
+    background: linear-gradient(90deg,
+            var(--bg-tertiary) 25%,
+            var(--bg-secondary) 50%,
+            var(--bg-tertiary) 75%);
+    background-size: 200% 100%;
+    animation: loading-shimmer 2s ease-in-out infinite;
+    border-radius: 2px;
+    height: 1em;
+    width: 80px;
+    vertical-align: middle;
+}
+
+@keyframes loading-shimmer {
+    0% {
+        background-position: -200% 0;
+    }
+
+    100% {
+        background-position: 200% 0;
+    }
+}
+
+/* Loading state for cell output */
+.cell-output:has(.loading-spinner) {
+    opacity: 0.7;
+    background: var(--bg-secondary);
+    /* border-left: 3px solid var(--text-link); */
+}
+    </style>
+    <script>
+// --- Drag utilities ---
+        function clamp(val, min, max) { return Math.max(min, Math.min(max, val)); }
+
+        function restorePosition(el, storageKey) {
+            try {
+                const raw = localStorage.getItem(storageKey);
+                if (!raw) return;
+                const pos = JSON.parse(raw);
+                if (typeof pos.left === 'number' && typeof pos.top === 'number') {
+                    el.style.left = pos.left + 'px';
+                    el.style.top = pos.top + 'px';
+                    el.style.right = 'auto';
+                    el.style.bottom = 'auto';
+                }
+            } catch (_) {}
+        }
+
+        function savePosition(el, storageKey) {
+            try {
+                const left = parseFloat(el.style.left || 'NaN');
+                const top = parseFloat(el.style.top || 'NaN');
+                if (!Number.isNaN(left) && !Number.isNaN(top)) {
+                    localStorage.setItem(storageKey, JSON.stringify({ left, top }));
+                }
+            } catch (_) {}
+        }
+
+
+        function makeDraggable(el, storageKey, handleEl) {
+            let dragging = false;
+            let startX = 0, startY = 0; // cursor
+            let origLeft = 0, origTop = 0; // element
+
+            const onMove = (e) => {
+                if (!dragging) return;
+                const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+                const clientY = e.touches ? e.touches[0].clientY : e.clientY;
+                const dx = clientX - startX;
+                const dy = clientY - startY;
+                const w = el.offsetWidth;
+                const h = el.offsetHeight;
+                const maxX = window.innerWidth - w;
+                const maxY = window.innerHeight - h;
+                const newLeft = clamp(origLeft + dx, 0, maxX);
+                const newTop = clamp(origTop + dy, 0, maxY);
+                el.style.left = newLeft + 'px';
+                el.style.top = newTop + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+            };
+
+            const endDrag = () => {
+                if (!dragging) return;
+                dragging = false;
+                document.removeEventListener('mousemove', onMove);
+                document.removeEventListener('mouseup', endDrag);
+                document.removeEventListener('touchmove', onMove);
+                document.removeEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grab');
+                savePosition(el, storageKey);
+                // ensure no-overlap constraint after a drag
+                try { layoutWidgetsStackedBottomRight(); } catch (_) {}
+            };
+
+            const startDrag = (e) => {
+                // Start from element's current on-screen rect
+                const elRect = el.getBoundingClientRect();
+                el.style.left = elRect.left + 'px';
+                el.style.top = elRect.top + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+
+                dragging = true;
+                startX = e.touches ? e.touches[0].clientX : e.clientX;
+                startY = e.touches ? e.touches[0].clientY : e.clientY;
+                origLeft = elRect.left;
+                origTop = elRect.top;
+
+                document.addEventListener('mousemove', onMove);
+                document.addEventListener('mouseup', endDrag);
+                document.addEventListener('touchmove', onMove, { passive: false });
+                document.addEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grabbing');
+                e.preventDefault();
+            };
+
+            (handleEl || el).addEventListener('mousedown', startDrag);
+            (handleEl || el).addEventListener('touchstart', startDrag, { passive: false });
+
+            // Apply any saved position on init
+            restorePosition(el, storageKey);
+        }
+        function toggleCell(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+            }
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+            }
+            
+            updateIndicators(cellId);
+            encodeToolStateToUrl();
+        }
+        
+        function toggleCode(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+        
+        function toggleOutput(cellId) {
+            const outputElement = document.getElementById('output-' + cellId);
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+
+        function toggleUvLogs(headerElement) {
+            const contentElement = headerElement.nextElementSibling;
+            if (contentElement) {
+                const isCollapsed = contentElement.style.display === 'none';
+                contentElement.style.display = isCollapsed ? 'block' : 'none';
+                headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+
+                // Update the header indicator if it exists
+                const uvLogsDiv = headerElement.parentElement;
+                if (uvLogsDiv && uvLogsDiv.id && uvLogsDiv.id.startsWith('uv-logs-')) {
+                    const cellId = uvLogsDiv.id.replace('uv-logs-', '');
+                    const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+
+        function toggleUvLogsFromHeader(cellId) {
+            const uvLogsElement = document.getElementById('uv-logs-' + cellId);
+            const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+            if (uvLogsElement) {
+                const headerElement = uvLogsElement.querySelector('.uv-logs-header');
+                const contentElement = uvLogsElement.querySelector('.uv-logs-content');
+                if (contentElement && headerElement) {
+                    const isCollapsed = contentElement.style.display === 'none';
+                    contentElement.style.display = isCollapsed ? 'block' : 'none';
+                    headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+        
+        function updateIndicators(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            const indicators = document.querySelector(`[onclick*="${cellId}"]`)?.closest('.cell-header')?.querySelector('.collapse-indicators');
+            
+            if (indicators) {
+                const codeCollapsed = codeElement && codeElement.classList.contains('collapsed');
+                const outputCollapsed = outputElement && outputElement.classList.contains('collapsed');
+                
+                const codeIcon = codeCollapsed ? '▶' : '▼';
+                const outputIcon = outputCollapsed ? '▶' : '▼';
+                
+                const codeSpan = indicators.querySelector('[onclick*="toggleCode"]');
+                const outputSpan = indicators.querySelector('[onclick*="toggleOutput"]');
+                
+                if (codeSpan) codeSpan.innerHTML = `${codeIcon} code`;
+                if (outputSpan) outputSpan.innerHTML = `${outputIcon} output`;
+            }
+        }
+        
+        function toggleTheme() {
+            const html = document.documentElement;
+            const currentTheme = html.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            html.setAttribute('data-theme', newTheme);
+            localStorage.setItem('uvnote-theme', newTheme);
+            updateThemeIcon();
+            updateUiDebug();
+        }
+        
+        // Two panel code removed
+        
+        function updateThemeIcon() {
+            const theme = document.documentElement.getAttribute('data-theme');
+            const toggle = document.querySelector('.theme-toggle');
+            if (toggle) {
+                toggle.textContent = theme === 'dark' ? 'light' : 'dark';
+            }
+        }
+        function setUiTheme(newUi) {
+            if (newUi !== 'default' && newUi !== 'none' && newUi !== 'monocolor') return;
+            const html = document.documentElement;
+            html.setAttribute('data-ui', newUi);
+            try { localStorage.setItem('uvnote-ui', newUi); } catch (_) {}
+            updateUiMenu();
+            updateUiDebug();
+        }
+        function updateUiMenu() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const checks = {
+                default: document.getElementById('checkbox-ui-default'),
+                none: document.getElementById('checkbox-ui-none'),
+                monocolor: document.getElementById('checkbox-ui-monocolor')
+            };
+            if (checks.default) checks.default.textContent = ui === 'default' ? '☑' : '☐';
+            if (checks.none) checks.none.textContent = ui === 'none' ? '☑' : '☐';
+            if (checks.monocolor) checks.monocolor.textContent = ui === 'monocolor' ? '☑' : '☐';
+        }
+
+        function updateUiDebug() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const color = document.documentElement.getAttribute('data-theme') || 'light';
+            const el = document.getElementById('ui-debug');
+            if (el) {
+                el.textContent = `UI: ${ui} | Color: ${color}`;
+            }
+        }
+
+        // Line selection and deep-linking
+        function clearLineSelections() {
+            try {
+                document.querySelectorAll('.code-line-highlight').forEach(el => { el.style.display = 'none'; });
+                document.querySelectorAll('.line-number.selected').forEach(el => el.classList.remove('selected'));
+            } catch (_) {}
+        }
+
+        let _selection = null; // { cellId, a, b }
+
+        function clearSelection(updateUrl) {
+            clearLineSelections();
+            _selection = null;
+            if (updateUrl) {
+                try {
+                    const url = new URL(window.location.href);
+                    url.searchParams.delete('cell');
+                    url.searchParams.delete('line');
+                    history.replaceState(null, '', url.toString());
+                } catch (_) {}
+            }
+            updateStateIndicator();
+        }
+
+        function selectCellLine(cellId, line, updateUrl) {
+            try {
+                // Ensure only one selection across the whole document
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                // Measure line height directly from computed style
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                // Position overlay
+                overlay.style.display = 'block';
+                overlay.style.height = `${lh}px`;
+                overlay.style.top = `${pre.offsetTop + padTop + (line - 1) * lh}px`;
+
+                // Update selected class in line numbers
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    const sel = numbers.querySelector(`.line-number[data-line="${line}"]`);
+                    if (sel) sel.classList.add('selected');
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    url.searchParams.set('line', String(line));
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a: line, b: line };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLine error', e); }
+        }
+
+        function selectCellLines(cellId, startLine, endLine, updateUrl) {
+            try {
+                // normalize order
+                const a = Math.min(startLine, endLine);
+                const b = Math.max(startLine, endLine);
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                overlay.style.display = 'block';
+                overlay.style.top = `${pre.offsetTop + padTop + (a - 1) * lh}px`;
+                overlay.style.height = `${(b - a + 1) * lh}px`;
+
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    for (let i = a; i <= b; i++) {
+                        const el = numbers.querySelector(`.line-number[data-line="${i}"]`);
+                        if (el) el.classList.add('selected');
+                    }
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    if (a === b) url.searchParams.set('line', String(a));
+                    else url.searchParams.set('line', `${a}-${b}`);
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a, b };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLines error', e); }
+        }
+
+        // Drag-to-select support on line numbers
+        let _lineDrag = { active: false, cellId: null, start: 0 };
+        function onLineNumberMouseDown(e) {
+            const a = e.target.closest('.line-number');
+            if (!a) return;
+            e.preventDefault();
+            const cellId = a.dataset.cell;
+            const line = parseInt(a.dataset.line || '1', 10) || 1;
+            // Toggle off if this exact single line is already the only selection
+            const numbers = document.getElementById(`lines-${cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length === 1 && selected[0] === line) {
+                    clearSelection(true);
+                    return;
+                }
+            }
+            _lineDrag.active = true;
+            _lineDrag.cellId = cellId;
+            _lineDrag.start = line;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, _lineDrag.start, false);
+        }
+        function onDocMouseMove(e) {
+            if (!_lineDrag.active) return;
+            const el = document.elementFromPoint(e.clientX, e.clientY);
+            if (!el) return;
+            const a = el.closest && el.closest('.line-number');
+            if (!a) return;
+            if (a.dataset.cell !== _lineDrag.cellId) return;
+            const cur = parseInt(a.dataset.line || '1', 10) || 1;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, cur, false);
+        }
+        function onDocMouseUp(e) {
+            if (!_lineDrag.active) return;
+            const last = document.querySelector('.line-number.selected:last-of-type');
+            // finalize URL using the current selected range
+            const numbers = document.getElementById(`lines-${_lineDrag.cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length) {
+                    const a = Math.min(...selected); const b = Math.max(...selected);
+                    selectCellLines(_lineDrag.cellId, a, b, true);
+                }
+            }
+            _lineDrag.active = false; _lineDrag.cellId = null; _lineDrag.start = 0;
+        }
+
+        function applyLocationFromUrl() {
+            try {
+                const url = new URL(window.location.href);
+                const cell = url.searchParams.get('cell');
+                const lineParam = url.searchParams.get('line');
+                if (cell && lineParam) {
+                    if (lineParam.includes('-')) {
+                        const [a, b] = lineParam.split('-').map(x => parseInt(x, 10));
+                        if (!Number.isNaN(a) && !Number.isNaN(b)) selectCellLines(cell, a, b, false);
+                    } else {
+                        const l = parseInt(lineParam, 10);
+                        if (!Number.isNaN(l)) selectCellLine(cell, l, false);
+                    }
+                }
+                
+                // Apply tool parameters from URL
+                applyToolsFromUrl(url.searchParams);
+                
+                // Cell states will be applied later in DOMContentLoaded with proper timing
+                const encodedCellStates = url.searchParams.get('cells');
+                console.log('Encoded cell states from URL:', encodedCellStates);
+            } catch (_) {}
+        }
+        
+        function applyToolsFromUrl(params) {
+            try {
+                // Check if tools widget should be shown
+                const showTools = params.get('tools');
+                if (showTools === '1') {
+                    // Mark that tool was loaded from URL
+                    _urlLoadedTool = true;
+                    
+                    // Apply color
+                    const color = params.get('color');
+                    if (color && /^[0-9a-fA-F]{6}$/.test(color)) {
+                        setStoredArrowColor('#' + color);
+                    }
+                    
+                    // Apply thickness
+                    const thickness = params.get('thickness');
+                    if (thickness) {
+                        const value = parseInt(thickness, 10);
+                        if (value >= 1 && value <= 10) {
+                            setStoredLineThickness(value);
+                        }
+                    }
+                    
+                    // Don't override fadeout time for URL-loaded tools - let individual shapes decide
+                    
+                    // Load shapes from URL
+                    const encodedShapes = params.get('shapes');
+                    if (encodedShapes) {
+                        const decodedShapes = decodeShapesFromUrl(encodedShapes);
+                        if (decodedShapes.length > 0) {
+                            _shapes = decodedShapes;
+                            saveShapes();
+                            // Trigger render after overlay is initialized
+                            setTimeout(() => {
+                                renderOverlay();
+                            }, 300);
+                        }
+                    }
+                    
+                    // Wait for widgets to be initialized before showing tools
+                    setTimeout(() => {
+                        const toolsWidget = document.querySelector('.tools-widget');
+                        const checkbox = document.getElementById('checkbox-tools');
+                        if (toolsWidget && checkbox) {
+                            toolsWidget.style.display = 'block';
+                            checkbox.textContent = '☑';
+                            localStorage.setItem('uvnote-widget-tools', 'visible');
+                        }
+                        
+                        // Apply active tool
+                        const activeTool = params.get('tool');
+                        if (activeTool && ['arrow', 'pen', 'eraser', 'spotlight'].includes(activeTool)) {
+                            const toolBtn = Array.from(document.querySelectorAll('.tool-button')).find(btn => btn.textContent === activeTool);
+                            if (toolBtn) {
+                                toolBtn.click();
+                            }
+                        }
+                        
+                        // Re-layout widgets after showing tools
+                        layoutWidgetsStackedBottomRight();
+                    }, 200);
+                }
+            } catch (_) {}
+        }
+        
+        function captureInitialCellStates() {
+            const cells = document.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    if (codeEl) {
+                        state.c = codeEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    if (outputEl) {
+                        state.o = outputEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    _initialCellStates[cellId] = state;
+                }
+            });
+            console.log('Captured initial cell states:', _initialCellStates);
+        }
+
+        function encodeCellStatesToUrl() {
+            // Get all cells and their collapse states
+            const cells = document.querySelectorAll('.cell');
+            const cellStates = {};
+            
+            console.log('Found cells:', cells.length);
+            
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                const initialState = _initialCellStates[cellId] || {};
+                
+                console.log(`Encoding cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    codeCollapsed: codeEl ? codeEl.classList.contains('collapsed') : 'N/A',
+                    outputCollapsed: outputEl ? outputEl.classList.contains('collapsed') : 'N/A',
+                    initialState: initialState
+                });
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    let hasChanges = false;
+                    
+                    if (codeEl) {
+                        const currentCodeState = codeEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialCodeState = initialState.c;
+                        // Only encode if different from initial state
+                        if (initialCodeState !== undefined && currentCodeState !== initialCodeState) {
+                            state.c = currentCodeState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    if (outputEl) {
+                        const currentOutputState = outputEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialOutputState = initialState.o;
+                        // Only encode if different from initial state
+                        if (initialOutputState !== undefined && currentOutputState !== initialOutputState) {
+                            state.o = currentOutputState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    // Only include cell if it has changes from initial state
+                    if (hasChanges) {
+                        cellStates[cellId] = state;
+                        console.log(`Added cell ${cellId}:`, state);
+                    }
+                }
+            });
+            
+            console.log('Final cell states to encode:', cellStates);
+            
+            // Return empty string if no changed cells
+            if (Object.keys(cellStates).length === 0) return '';
+            
+            // Encode as compact base64 string
+            const encoded = btoa(JSON.stringify(cellStates));
+            console.log('Encoded cell states:', encoded);
+            return encoded;
+        }
+        
+        function decodeCellStatesFromUrl(encodedStates) {
+            if (!encodedStates) return {};
+            
+            try {
+                return JSON.parse(atob(encodedStates));
+            } catch (e) {
+                console.error('Failed to decode cell states:', e);
+                return {};
+            }
+        }
+        
+        function applyCellStatesFromUrl(cellStates) {
+            console.log('Applying cell states from URL:', cellStates);
+            Object.entries(cellStates).forEach(([cellId, state]) => {
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                console.log(`Cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    state: state
+                });
+                
+                if (codeEl && state.c !== undefined) {
+                    if (state.c === 0) {
+                        codeEl.classList.add('collapsed');
+                        console.log(`Collapsed code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    } else {
+                        codeEl.classList.remove('collapsed');
+                        codeEl.classList.add('expanded'); // Explicitly add expanded class
+                        console.log(`Expanded code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            hasExpandedClass: codeEl.classList.contains('expanded'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    }
+                }
+                
+                if (outputEl && state.o !== undefined) {
+                    if (state.o === 0) {
+                        outputEl.classList.add('collapsed');
+                        console.log(`Collapsed output for cell ${cellId}`);
+                    } else {
+                        outputEl.classList.remove('collapsed');
+                        console.log(`Expanded output for cell ${cellId}`);
+                    }
+                }
+                
+                // Update visual indicators and force style recalculation
+                try {
+                    updateIndicators(cellId);
+                    // Force browser to recalculate styles
+                    if (codeEl) {
+                        codeEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - code visible: ${getComputedStyle(codeEl).display !== 'none'}`);
+                    }
+                    if (outputEl) {
+                        outputEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - output visible: ${getComputedStyle(outputEl).display !== 'none'}`);
+                    }
+                } catch (e) {
+                    console.error(`Error updating indicators for cell ${cellId}:`, e);
+                }
+            });
+        }
+
+        function encodeShapesToUrl() {
+            // Encode shapes as compact base64 string
+            if (_shapes.length === 0) return '';
+            
+            const shapeData = _shapes.map(shape => {
+                const baseData = {
+                    ct: shape.createdAt, // creation timestamp
+                    fo: shape.fadeoutTime || getFadeoutTime() // fadeout time for this shape
+                };
+                
+                if (shape.type === 'arrow') {
+                    return {
+                        ...baseData,
+                        t: 'a',
+                        x1: Math.round(shape.x1),
+                        y1: Math.round(shape.y1),
+                        x2: Math.round(shape.x2),
+                        y2: Math.round(shape.y2),
+                        c: shape.color.substring(1), // remove #
+                        w: shape.width
+                    };
+                } else if (shape.type === 'pen') {
+                    return {
+                        ...baseData,
+                        t: 'p',
+                        pts: shape.points.map(p => [Math.round(p.x), Math.round(p.y)]),
+                        c: shape.color.substring(1),
+                        w: shape.width
+                    };
+                } else if (shape.type === 'spotlight') {
+                    return {
+                        ...baseData,
+                        t: 's',
+                        x: Math.round(shape.x),
+                        y: Math.round(shape.y),
+                        r: Math.round(shape.radius)
+                    };
+                }
+            }).filter(Boolean);
+            
+            return btoa(JSON.stringify(shapeData));
+        }
+        
+        function decodeShapesFromUrl(encodedShapes) {
+            if (!encodedShapes) return [];
+            
+            try {
+                const shapeData = JSON.parse(atob(encodedShapes));
+                return shapeData.map(data => {
+                    const base = {
+                        createdAt: data.ct || Date.now(), // use encoded timestamp or current time
+                        fadeoutTime: data.fo || 0, // use encoded fadeout time or 0 (never fade)
+                        opacity: 1.0
+                    };
+                    
+                    if (data.t === 'a') {
+                        return {
+                            ...base,
+                            type: 'arrow',
+                            x1: data.x1,
+                            y1: data.y1,
+                            x2: data.x2,
+                            y2: data.y2,
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 'p') {
+                        return {
+                            ...base,
+                            type: 'pen',
+                            points: data.pts.map(([x, y]) => ({ x, y })),
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 's') {
+                        return {
+                            ...base,
+                            type: 'spotlight',
+                            x: data.x,
+                            y: data.y,
+                            radius: data.r,
+                            color: '#000000'
+                        };
+                    }
+                }).filter(Boolean);
+            } catch (e) {
+                console.error('Failed to decode shapes:', e);
+                return [];
+            }
+        }
+        
+        function encodeToolStateToUrl() {
+            // Don't update URL during initialization
+            if (_isInitializing) {
+                return window.location.href;
+            }
+            
+            const params = new URLSearchParams(window.location.search);
+            
+            // Check if tools widget is visible and has an active tool
+            const toolsWidget = document.querySelector('.tools-widget');
+            const activeTool = document.body.dataset.tool;
+            const hasActiveTool = activeTool && activeTool !== 'none';
+            const toolsWidgetVisible = toolsWidget && getComputedStyle(toolsWidget).display !== 'none';
+            
+            // Always handle shapes regardless of tool state
+            const encodedShapes = encodeShapesToUrl();
+            if (encodedShapes) {
+                params.set('shapes', encodedShapes);
+            } else {
+                params.delete('shapes');
+            }
+            
+            // Always preserve existing cell states from URL if present
+            const existingCellStates = params.get('cells');
+            if (existingCellStates) {
+                // Keep existing cell states - don't re-encode from DOM
+                params.set('cells', existingCellStates);
+            } else {
+                // Only encode new cell states if none exist in URL
+                const encodedCellStates = encodeCellStatesToUrl();
+                if (encodedCellStates) {
+                    params.set('cells', encodedCellStates);
+                }
+            }
+            
+            if (toolsWidgetVisible && hasActiveTool) {
+                // Include tool params when widget is visible AND tool is active
+                params.set('tools', '1');
+                params.set('tool', activeTool);
+                
+                // Get color (without # prefix)
+                const color = getArrowColor();
+                if (color && color.startsWith('#')) {
+                    params.set('color', color.substring(1));
+                }
+                
+                // Get thickness
+                const thickness = getLineThickness();
+                params.set('thickness', thickness.toString());
+            } else {
+                // Remove tool state params but keep shapes
+                params.delete('tools');
+                params.delete('tool');
+                params.delete('color');
+                params.delete('thickness');
+                params.delete('fadeout');
+            }
+            
+            // Update URL without reloading
+            const newUrl = window.location.pathname + (params.toString() ? '?' + params.toString() : '') + window.location.hash;
+            window.history.replaceState(null, '', newUrl);
+            
+            return window.location.href;
+        }
+
+        function resetLayout() {
+            try {
+                // Clear all uvnote-* keys
+                const allKeys = Object.keys(localStorage);
+                const uvnoteKeys = allKeys.filter(key => key.startsWith('uvnote-'));
+                uvnoteKeys.forEach(k => localStorage.removeItem(k));
+            } catch (_) {}
+            
+            // Clear any active selection and remove URL params
+            try { clearSelection(true); } catch(_) {}
+            // Reset active tool if any
+            try { window.setActiveTool('none'); } catch(_) {}
+            // Clear shapes
+            try { _shapes = []; saveShapes(); } catch(_) {}
+            // Reset URL-loaded tool flag
+            try { _urlLoadedTool = false; } catch(_) {}
+            // Reset all cells to expanded state
+            try {
+                const cells = document.querySelectorAll('.cell');
+                cells.forEach(cell => {
+                    const cellId = cell.id.replace('cell-', '');
+                    const codeEl = document.getElementById('code-' + cellId);
+                    const outputEl = document.getElementById('output-' + cellId);
+                    if (codeEl) codeEl.classList.remove('collapsed');
+                    if (outputEl) outputEl.classList.remove('collapsed');
+                    updateIndicators(cellId);
+                });
+            } catch(_) {}
+            
+            // Clear ALL URL parameters and reload with clean URL
+            try {
+                const cleanUrl = window.location.pathname + window.location.hash;
+                window.location.href = cleanUrl; // Use window.location.href instead of history.replaceState + reload
+            } catch (_) {
+                // Fallback - reload current page
+                location.reload();
+            }
+        }
+
+        function toggleMenu() {
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.toggle('active');
+            }
+        }
+
+        function toggleWidget(widgetName) {
+            let widget;
+            let checkbox;
+            
+            // Close the menu first
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.remove('active');
+            }
+            
+            switch(widgetName) {
+                case 'tools':
+                    widget = document.querySelector('.tools-widget');
+                    checkbox = document.getElementById('checkbox-tools');
+                    break;
+                case 'file-explorer':
+                    widget = document.querySelector('.file-explorer');
+                    checkbox = document.getElementById('checkbox-file-explorer');
+                    break;
+                case 'minimap':
+                    widget = document.querySelector('.minimap');
+                    checkbox = document.getElementById('checkbox-minimap');
+                    break;
+                case 'status':
+                    widget = document.querySelector('.status-widget');
+                    checkbox = document.getElementById('checkbox-status');
+                    break;
+                default:
+                    return;
+            }
+            
+            if (widget && checkbox) {
+                const isVisible = getComputedStyle(widget).display !== 'none';
+                widget.style.display = isVisible ? 'none' : 'block';
+                checkbox.textContent = isVisible ? '☐' : '☑';
+                
+                // Save state to localStorage
+                try {
+                    localStorage.setItem(`uvnote-widget-${widgetName}`, isVisible ? 'hidden' : 'visible');
+                } catch (_) {}
+                
+                // Re-layout widgets after visibility change
+                try { 
+                    layoutWidgetsStackedBottomRight(); 
+                } catch (_) {}
+                
+                // Update URL when tools widget visibility changes
+                if (widgetName === 'tools') {
+                    encodeToolStateToUrl();
+                }
+            }
+        }
+        
+        function initializeWidgetVisibility() {
+            const widgets = [
+                { name: 'tools', selector: '.tools-widget' },
+                { name: 'file-explorer', selector: '.file-explorer' },
+                { name: 'minimap', selector: '.minimap' },
+                { name: 'status', selector: '.status-widget' }
+            ];
+            
+            widgets.forEach(({ name, selector }) => {
+                const defaultState = name === 'status' ? 'visible' : 'hidden';
+                const savedState = localStorage.getItem(`uvnote-widget-${name}`) || defaultState;
+                const widget = document.querySelector(selector);
+                const checkbox = document.getElementById(`checkbox-${name}`);
+                
+                if (widget && checkbox) {
+                    const isVisible = savedState === 'visible';
+                    widget.style.display = isVisible ? 'block' : 'none';
+                    checkbox.textContent = isVisible ? '☑' : '☐';
+                }
+            });
+        }
+
+        // Close menu when clicking outside
+        document.addEventListener('click', function(event) {
+            const menuButton = document.querySelector('.menu-button');
+            // Don't close if clicking on a menu item (let the item handler close it)
+            if (menuButton && !menuButton.contains(event.target)) {
+                menuButton.classList.remove('active');
+            }
+        });
+
+        // Layout: stack widgets bottom-right and equalize widths
+        function hasCustomWidgetPositions() {
+            try {
+                return (
+                    localStorage.getItem('uvnote-minimap-pos') ||
+                    localStorage.getItem('uvnote-file-explorer-pos') ||
+                    localStorage.getItem('uvnote-tools-pos')
+                );
+            } catch (_) { return false; }
+        }
+
+        function rectsOverlap(r1, r2) {
+            return !(r1.right <= r2.left || r2.right <= r1.left || r1.bottom <= r2.top || r2.bottom <= r1.top);
+        }
+
+        function widgetsOverlap(widgets) {
+            for (let i = 0; i < widgets.length; i++) {
+                const a = widgets[i];
+                const ra = a.getBoundingClientRect();
+                for (let j = i + 1; j < widgets.length; j++) {
+                    const b = widgets[j];
+                    const rb = b.getBoundingClientRect();
+                    if (rectsOverlap(ra, rb)) return true;
+                }
+            }
+            return false;
+        }
+
+        function applyStackLayout(widgets, order) {
+            if (!widgets.length) return;
+            // Fixed equal width
+            const fixedWidth = 220;
+            widgets.forEach(el => { el.style.width = fixedWidth + 'px'; });
+
+            // Fit heights if needed to avoid overflow
+            const gap = 12;
+            const available = Math.max(0, window.innerHeight - 40 - gap * (order.length - 1));
+            const eachMax = Math.floor(available / order.length);
+            order.forEach(el => {
+                el.style.maxHeight = eachMax + 'px';
+                el.style.overflowY = 'auto';
+            });
+
+            // Stack bottom-up in the requested order
+            let bottomOffset = 20; // base gutter
+            order.forEach(el => {
+                el.style.left = 'auto';
+                el.style.top = 'auto';
+                el.style.right = '20px';
+                el.style.bottom = bottomOffset + 'px';
+                bottomOffset += el.offsetHeight + gap;
+            });
+        }
+
+        function layoutWidgetsStackedBottomRight() {
+            const minimap = document.querySelector('.minimap');
+            const fileExplorer = document.querySelector('.file-explorer');
+            const tools = document.querySelector('.tools-widget');
+            const status = document.querySelector('.status-widget');
+            const widgets = [minimap, fileExplorer, tools, status].filter(el => el && getComputedStyle(el).display !== 'none');
+            if (!widgets.length) return;
+
+            const order = [minimap, fileExplorer, tools, status].filter(Boolean).filter(el => getComputedStyle(el).display !== 'none');
+
+            // If user placed custom positions and there is no overlap, respect them.
+            if (hasCustomWidgetPositions() && !widgetsOverlap(widgets)) return;
+
+            applyStackLayout(widgets, order);
+        }
+        
+        // Panel icon removed
+        
+        let _minimapScrollContainer = null;
+        let _minimapScrollHandler = null;
+        function initMinimap() {
+            // Generate minimap content
+            const minimap = createMinimap();
+            document.body.appendChild(minimap);
+            // Make draggable (use title as handle)
+            const mTitle = minimap.querySelector('.minimap-title');
+            makeDraggable(minimap, 'uvnote-minimap-pos', mTitle);
+
+            // Attach scroll listener to window (two-panel removed)
+            _minimapScrollContainer = window;
+
+            if (_minimapScrollContainer) {
+                _minimapScrollHandler = () => updateMinimapActive();
+                if (_minimapScrollContainer === window) {
+                    window.addEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.addEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            updateMinimapActive();
+        }
+
+        function teardownMinimap() {
+            const minimap = document.querySelector('.minimap');
+            if (minimap && minimap.parentNode) minimap.parentNode.removeChild(minimap);
+            if (_minimapScrollContainer && _minimapScrollHandler) {
+                if (_minimapScrollContainer === window) {
+                    window.removeEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.removeEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            _minimapScrollContainer = null;
+            _minimapScrollHandler = null;
+        }
+        
+        function initFileExplorer() {
+            // Generate file explorer content
+            const fileExplorer = createFileExplorer();
+            document.body.appendChild(fileExplorer);
+        }
+        
+        function createMinimap() {
+            const minimap = document.createElement('div');
+            minimap.className = 'minimap';
+            
+            const title = document.createElement('div');
+            title.className = 'minimap-title';
+            title.textContent = 'navigation';
+            minimap.appendChild(title);
+            
+            // Find all headings and cells
+            const root = document.querySelector('.main-content') || document;
+            const headings = root.querySelectorAll('h1, h2, h3, h4, h5, h6');
+            const cells = root.querySelectorAll('.cell');
+            
+            // Combine and sort by position
+            const items = [];
+            
+            headings.forEach(heading => {
+                const id = heading.id || generateId(heading.textContent);
+                if (!heading.id) heading.id = id;
+                
+                items.push({
+                    element: heading,
+                    type: 'heading',
+                    level: parseInt(heading.tagName.charAt(1)),
+                    text: heading.textContent.trim(),
+                    id: id,
+                    position: heading.getBoundingClientRect().top + window.scrollY
+                });
+            });
+            
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const id = cell.id || `cell-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+                    if (!cell.id) cell.id = id;
+                    
+                    items.push({
+                        element: cell,
+                        type: 'cell',
+                        text: header.textContent.trim(),
+                        id: id,
+                        position: cell.getBoundingClientRect().top + window.scrollY
+                    });
+                }
+            });
+            
+            // Sort by position
+            items.sort((a, b) => a.position - b.position);
+            
+            // Create minimap items
+            items.forEach(item => {
+                const link = document.createElement('a');
+                link.className = `minimap-item ${item.type === 'heading' ? 'minimap-heading' : 'minimap-cell'}`;
+                if (item.type === 'heading') {
+                    link.classList.add(`h${item.level}`);
+                }
+                link.textContent = item.text.length > 25 ? item.text.substring(0, 22) + '...' : item.text;
+                link.href = `#${item.id}`;
+                link.onclick = function(e) {
+                    e.preventDefault();
+                    item.element.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                };
+                minimap.appendChild(link);
+            });
+            
+            return minimap;
+        }
+        
+        function generateId(text) {
+            return text.toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '')
+                .substring(0, 20);
+        }
+        
+        function updateMinimapActive() {
+            const minimapItems = document.querySelectorAll('.minimap-item');
+            const container = _minimapScrollContainer || window;
+            const containerRect = container === window ? null : container.getBoundingClientRect();
+            const scrollPos = (container === window ? window.scrollY : container.scrollTop) + 100; // Offset for better detection
+            
+            let activeItem = null;
+            minimapItems.forEach(item => {
+                const targetId = item.getAttribute('href').substring(1);
+                const target = document.getElementById(targetId);
+                
+                if (target) {
+                    const rectTop = target.getBoundingClientRect().top;
+                    const targetPos = (container === window)
+                        ? rectTop + window.scrollY
+                        : rectTop - containerRect.top + container.scrollTop;
+                    if (targetPos <= scrollPos) {
+                        activeItem = item;
+                    }
+                }
+                
+                item.classList.remove('active');
+            });
+            
+            if (activeItem) {
+                activeItem.classList.add('active');
+            }
+        }
+        
+        function createFileExplorer() {
+            const fileExplorer = document.createElement('div');
+            fileExplorer.className = 'file-explorer';
+            
+            const title = document.createElement('div');
+            title.className = 'file-explorer-title';
+            title.textContent = 'files';
+            fileExplorer.appendChild(title);
+            // Make draggable (use title as handle)
+            makeDraggable(fileExplorer, 'uvnote-file-explorer-pos', title);
+            
+            // Scripts section
+            const scriptsSection = document.createElement('div');
+            scriptsSection.className = 'file-explorer-section';
+            
+            const scriptsTitle = document.createElement('div');
+            scriptsTitle.className = 'file-explorer-section-title';
+            scriptsTitle.textContent = 'scripts';
+            scriptsSection.appendChild(scriptsTitle);
+            
+            // Find all cells and list their script files (single panel)
+            const root = document.querySelector('.main-content') || document;
+            const cells = root.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const cellText = header.textContent.trim();
+                    const cellMatch = cellText.match(/Cell: ([a-zA-Z_][a-zA-Z0-9_]*)/);
+                    if (cellMatch) {
+                        const cellId = cellMatch[1];
+                        const scriptItem = document.createElement('div');
+                        scriptItem.className = 'file-explorer-item script';
+                        scriptItem.textContent = `${cellId}.py`;
+                        scriptItem.onclick = function() {
+                            cell.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        };
+                        scriptsSection.appendChild(scriptItem);
+                    }
+                }
+            });
+            
+            fileExplorer.appendChild(scriptsSection);
+            
+            // Artifacts section
+            const artifactsSection = document.createElement('div');
+            artifactsSection.className = 'file-explorer-section';
+            
+            const artifactsTitle = document.createElement('div');
+            artifactsTitle.className = 'file-explorer-section-title';
+            artifactsTitle.textContent = 'artifacts';
+            artifactsSection.appendChild(artifactsTitle);
+            
+            // Find all artifact links (single panel)
+            const artifactsRoot = document.querySelector('.main-content') || document;
+            const artifacts = artifactsRoot.querySelectorAll('.artifact');
+            if (artifacts.length === 0) {
+                const noArtifacts = document.createElement('div');
+                noArtifacts.className = 'file-explorer-item artifact';
+                noArtifacts.textContent = '(none)';
+                noArtifacts.style.opacity = '0.5';
+                artifactsSection.appendChild(noArtifacts);
+            } else {
+                artifacts.forEach(artifact => {
+                    const artifactItem = document.createElement('div');
+                    artifactItem.className = 'file-explorer-item artifact';
+                    artifactItem.textContent = artifact.textContent;
+                    artifactItem.onclick = function() {
+                        artifact.click();
+                    };
+                    artifactsSection.appendChild(artifactItem);
+                });
+            }
+            
+            fileExplorer.appendChild(artifactsSection);
+            
+            return fileExplorer;
+        }
+
+        function initStatusWidget() {
+            let el = document.querySelector('.status-widget');
+            if (!el) {
+                el = document.createElement('div');
+                el.className = 'status-widget';
+                el.id = 'status-widget';
+                el.textContent = 'ready — Esc';
+                document.body.appendChild(el);
+            }
+        }
+
+        // Tools widget
+        // Last pointer position in overlay-canvas coordinates; written by onPointerMove.
+        let _cursorX = 0;
+        let _cursorY = 0;
+        // Set by setActiveTool and the pointer enter/leave handlers; several
+        // handlers only re-render the overlay while this is true.
+        let _cursorVisible = false;
+        
+        function setActiveTool(tool) {
+            if (!tool || tool === 'none') {
+                document.body.dataset.tool = 'none';
+                localStorage.setItem('uvnote-active-tool', 'none');
+                setOverlayActive(false);
+                _cursorVisible = false;
+                // Remove active class from all tool buttons when deactivating
+                const toolButtons = document.querySelectorAll('.tools-widget .tool-button');
+                toolButtons.forEach(btn => btn.classList.remove('active'));
+                updateStateIndicator();
+                encodeToolStateToUrl();
+                return;
+            }
+            document.body.dataset.tool = tool;
+            localStorage.setItem('uvnote-active-tool', tool);
+            setOverlayActive(true);
+            _cursorVisible = true;
+            updateStateIndicator();
+            encodeToolStateToUrl();
+        }
+
+        // Make setActiveTool globally accessible for ESC key handler
+        window.setActiveTool = setActiveTool;
+        
+
+        function getArrowColor() {
+            const saved = localStorage.getItem('uvnote-arrow-color');
+            if (saved) return saved;
+            return '#e53935'; // Default red color
+        }
+
+        function setStoredArrowColor(color) {
+            try { localStorage.setItem('uvnote-arrow-color', color); } catch (_) {}
+        }
+
+        function getLineThickness() {
+            const saved = localStorage.getItem('uvnote-line-thickness');
+            if (saved) return parseInt(saved, 10);
+            return 6; // default thickness
+        }
+
+        function setStoredLineThickness(thickness) {
+            try { localStorage.setItem('uvnote-line-thickness', thickness); } catch (_) {}
+        }
+
+        function getFadeoutTime() {
+            const saved = localStorage.getItem('uvnote-fadeout-time');
+            if (saved) return parseInt(saved, 10);
+            return 5; // default 5 seconds
+        }
+
+        function setStoredFadeoutTime(seconds) {
+            try { localStorage.setItem('uvnote-fadeout-time', seconds); } catch (_) {}
+        }
+
+        function createToolsWidget() {
+            const tools = document.createElement('div');
+            tools.className = 'tools-widget';
+
+            const title = document.createElement('div');
+            title.className = 'tools-title';
+            title.textContent = 'tools';
+            tools.appendChild(title);
+
+            const row = document.createElement('div');
+            row.className = 'tools-row';
+            tools.appendChild(row);
+
+            // Arrow tool
+            const arrowBtn = document.createElement('div');
+            arrowBtn.className = 'tool-button';
+            arrowBtn.textContent = 'arrow';
+            arrowBtn.onclick = function() {
+                const isActive = arrowBtn.classList.contains('active');
+                if (isActive) {
+                    arrowBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    arrowBtn.classList.add('active');
+                    setActiveTool('arrow');
+                }
+            };
+            row.appendChild(arrowBtn);
+
+            // Pen tool
+            const penBtn = document.createElement('div');
+            penBtn.className = 'tool-button';
+            penBtn.textContent = 'pen';
+            penBtn.onclick = function() {
+                const isActive = penBtn.classList.contains('active');
+                if (isActive) {
+                    penBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    penBtn.classList.add('active');
+                    setActiveTool('pen');
+                }
+            };
+            row.appendChild(penBtn);
+
+            // Eraser tool
+            const eraseBtn = document.createElement('div');
+            eraseBtn.className = 'tool-button';
+            eraseBtn.textContent = 'eraser';
+            eraseBtn.onclick = function() {
+                const isActive = eraseBtn.classList.contains('active');
+                if (isActive) {
+                    eraseBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    eraseBtn.classList.add('active');
+                    setActiveTool('eraser');
+                }
+            };
+            row.appendChild(eraseBtn);
+
+            // Spotlight tool
+            const spotlightBtn = document.createElement('div');
+            spotlightBtn.className = 'tool-button';
+            spotlightBtn.textContent = 'spotlight';
+            spotlightBtn.onclick = function() {
+                const isActive = spotlightBtn.classList.contains('active');
+                if (isActive) {
+                    spotlightBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    spotlightBtn.classList.add('active');
+                    setActiveTool('spotlight');
+                }
+            };
+            row.appendChild(spotlightBtn);
+
+            // Clear all
+            const clearBtn = document.createElement('div');
+            clearBtn.className = 'tool-button';
+            clearBtn.textContent = 'clear';
+            clearBtn.onclick = function() {
+                _shapes = [];
+                saveShapes();
+                renderOverlay();
+            };
+            row.appendChild(clearBtn);
+            
+            // We'll add the copy button at the end of the widget
+
+            // Restore active state from storage
+            const saved = localStorage.getItem('uvnote-active-tool') || 'none';
+            if (saved === 'arrow') {
+                arrowBtn.classList.add('active');
+                setActiveTool('arrow');
+            } else if (saved === 'pen') {
+                penBtn.classList.add('active');
+                setActiveTool('pen');
+            } else if (saved === 'eraser') {
+                eraseBtn.classList.add('active');
+                setActiveTool('eraser');
+            } else if (saved === 'spotlight') {
+                spotlightBtn.classList.add('active');
+                setActiveTool('spotlight');
+            }
+
+            // Color selector
+            const colorTitle = document.createElement('div');
+            colorTitle.className = 'tools-section-title';
+            colorTitle.textContent = 'color';
+            tools.appendChild(colorTitle);
+
+            const colorRow = document.createElement('div');
+            colorRow.className = 'tools-row color-row';
+            tools.appendChild(colorRow);
+
+            const swatchColors = [
+                // Primary colors
+                '#e53935', '#fb8c00', '#fdd835', '#43a047', '#1e88e5', '#8e24aa',
+                // Additional useful colors  
+                '#ff5722', '#795548', '#607d8b', '#9c27b0',
+                // Grayscale
+                '#000000', '#424242', '#9e9e9e', '#ffffff'
+            ];
+            const swatches = [];
+            swatchColors.forEach(c => {
+                const s = document.createElement('div');
+                s.className = 'color-swatch';
+                s.style.backgroundColor = c;
+                s.title = c;
+                s.onclick = () => {
+                    setStoredArrowColor(c);
+                    refreshColorUI(c);
+                    if (_cursorVisible) renderOverlay();
+                    encodeToolStateToUrl();
+                };
+                colorRow.appendChild(s);
+                swatches.push(s);
+            });
+
+            const colorInput = document.createElement('input');
+            colorInput.type = 'color';
+            colorInput.className = 'color-input';
+            colorInput.oninput = () => {
+                setStoredArrowColor(colorInput.value);
+                refreshColorUI(colorInput.value);
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+            colorRow.appendChild(colorInput);
+
+            function refreshColorUI(selected) {
+                const selectedHex = selected.startsWith('#') ? selected.toLowerCase() : rgbToHex(selected);
+                
+                swatches.forEach((s, i) => {
+                    const swatchHex = swatchColors[i].toLowerCase();
+                    if (swatchHex === selectedHex) {
+                        s.classList.add('selected');
+                    } else {
+                        s.classList.remove('selected');
+                    }
+                });
+                
+                try { 
+                    colorInput.value = selectedHex; 
+                } catch (_) {}
+            }
+
+            function rgbToHex(rgb) {
+                const m = rgb.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)\)/i);
+                if (!m) return '#000000';
+                const r = parseInt(m[1]).toString(16).padStart(2, '0');
+                const g = parseInt(m[2]).toString(16).padStart(2, '0');
+                const b = parseInt(m[3]).toString(16).padStart(2, '0');
+                return `#${r}${g}${b}`;
+            }
+
+            // Restore color selection
+            refreshColorUI(getArrowColor());
+
+            // Thickness slider
+            const thicknessTitle = document.createElement('div');
+            thicknessTitle.className = 'tools-section-title';
+            thicknessTitle.textContent = 'thickness';
+            tools.appendChild(thicknessTitle);
+
+            const thicknessRow = document.createElement('div');
+            thicknessRow.className = 'thickness-row';
+            tools.appendChild(thicknessRow);
+
+            const thicknessSlider = document.createElement('input');
+            thicknessSlider.type = 'range';
+            thicknessSlider.className = 'thickness-slider';
+            thicknessSlider.min = '1';
+            thicknessSlider.max = '10';
+            thicknessSlider.value = getLineThickness();
+            
+            const thicknessValue = document.createElement('span');
+            thicknessValue.className = 'thickness-value';
+            thicknessValue.textContent = thicknessSlider.value + 'px';
+
+            thicknessSlider.oninput = function() {
+                const value = parseInt(thicknessSlider.value, 10);
+                setStoredLineThickness(value);
+                thicknessValue.textContent = value + 'px';
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+
+            thicknessRow.appendChild(thicknessSlider);
+            thicknessRow.appendChild(thicknessValue);
+
+            // Fadeout time slider
+            const fadeoutTitle = document.createElement('div');
+            fadeoutTitle.className = 'tools-section-title';
+            fadeoutTitle.textContent = 'fadeout time';
+            tools.appendChild(fadeoutTitle);
+
+            const fadeoutRow = document.createElement('div');
+            fadeoutRow.className = 'thickness-row';
+            tools.appendChild(fadeoutRow);
+
+            const fadeoutSlider = document.createElement('input');
+            fadeoutSlider.type = 'range';
+            fadeoutSlider.className = 'thickness-slider';
+            fadeoutSlider.min = '0';
+            fadeoutSlider.max = '30';
+            fadeoutSlider.value = getFadeoutTime();
+            
+            const fadeoutValue = document.createElement('span');
+            fadeoutValue.className = 'thickness-value';
+            fadeoutValue.textContent = fadeoutSlider.value === '0' ? 'never' : fadeoutSlider.value + 's';
+
+            fadeoutSlider.oninput = function() {
+                const value = parseInt(fadeoutSlider.value, 10);
+                setStoredFadeoutTime(value);
+                fadeoutValue.textContent = value === 0 ? 'never' : value + 's';
+                encodeToolStateToUrl();
+            };
+
+            fadeoutRow.appendChild(fadeoutSlider);
+            fadeoutRow.appendChild(fadeoutValue);
+
+            // Draggable behavior
+            makeDraggable(tools, 'uvnote-tools-pos', title);
+
+            return tools;
+        }
+
+        function initTools() {
+            const widget = createToolsWidget();
+            document.body.appendChild(widget);
+        }
+
+        function teardownTools() {
+            const w = document.querySelector('.tools-widget');
+            if (w && w.parentNode) w.parentNode.removeChild(w);
+        }
+
+        // --- Canvas overlay for tools ---
+        // Overlay element; positioned and sized by updateOverlayBounds and used
+        // as the origin for toCanvasCoords. Null until created elsewhere.
+        let _overlay = null;
+        let _overlayCtx = null; // presumably the overlay's drawing context — TODO confirm where it is assigned
+        let _overlayContainer = null; // set to window by updateOverlayModeAndContainer
+        let _overlayMode = 'single';
+        let _overlayResizeHandler = null;
+        let _overlayScrollHandler = null;
+        // Shape currently being drawn, or null when idle: an arrow {x1,y1,x2,y2}
+        // (no `type` field), a pen stroke {type:'pen', points}, or a spotlight
+        // {type:'spotlight', x, y, radius}.
+        let _drawing = null;
+        let _shapes = []; // committed shapes for current mode
+        let _fadeTimer = null; // timer for fade animation
+        let _urlLoadedTool = false; // track if tool was loaded from URL
+        let _isInitializing = true; // prevent URL updates during initialization
+        let _initialCellStates = {}; // track initial cell states from page load
+
+        function getOverlayStorageKey() { return 'uvnote-shapes'; }
+
+        function loadShapes() {
+            try {
+                const raw = localStorage.getItem(getOverlayStorageKey());
+                _shapes = raw ? JSON.parse(raw) : [];
+            } catch (_) { _shapes = []; }
+        }
+
+        function saveShapes() {
+            try { 
+                localStorage.setItem(getOverlayStorageKey(), JSON.stringify(_shapes));
+                // Always update URL when shapes change
+                encodeToolStateToUrl();
+            } catch (_) {}
+        }
+
+        function updateShapesFade() {
+            const now = Date.now();
+            let needsUpdate = false;
+
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const shape = _shapes[i];
+                if (!shape.createdAt) continue; // Skip old shapes without timestamps
+                
+                // Use individual shape's fadeout time, or global if not set
+                const shapesFadeoutSeconds = shape.fadeoutTime !== undefined ? shape.fadeoutTime : getFadeoutTime();
+                
+                // Skip fading if fadeout is disabled for this shape
+                if (shapesFadeoutSeconds === 0) continue;
+                
+                const fadeStartTime = Math.max(0, (shapesFadeoutSeconds - 2) * 1000); // Start fading 2s before end
+                const fadeEndTime = shapesFadeoutSeconds * 1000; // Fully gone after specified time
+                const age = now - shape.createdAt;
+                
+                if (age >= fadeEndTime) {
+                    // Remove completely faded shapes
+                    _shapes.splice(i, 1);
+                    needsUpdate = true;
+                } else if (age >= fadeStartTime) {
+                    // Update opacity for fading shapes
+                    const fadeProgress = (age - fadeStartTime) / (fadeEndTime - fadeStartTime);
+                    const newOpacity = 1 - fadeProgress;
+                    if (Math.abs(shape.opacity - newOpacity) > 0.01) {
+                        shape.opacity = newOpacity;
+                        needsUpdate = true;
+                    }
+                }
+            }
+
+            if (needsUpdate) {
+                saveShapes();
+                renderOverlay();
+                // Update URL to remove faded shapes  
+                encodeToolStateToUrl();
+            }
+        }
+
+        function getContentContainer() { return window; }
+
+        function updateOverlayModeAndContainer() {
+            _overlayContainer = window;
+            _overlayMode = 'single';
+        }
+
+        function updateOverlayBounds() {
+            if (!_overlay) return;
+            if (_overlayContainer === window) {
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = '0px';
+                _overlay.style.top = '0px';
+                _overlay.width = window.innerWidth;
+                _overlay.height = window.innerHeight;
+            } else {
+                const rect = _overlayContainer.getBoundingClientRect();
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = rect.left + 'px';
+                _overlay.style.top = rect.top + 'px';
+                _overlay.width = Math.max(0, Math.floor(rect.width));
+                _overlay.height = Math.max(0, Math.floor(rect.height));
+            }
+            renderOverlay();
+        }
+
+        function containerScrollLeft() {
+            return (_overlayContainer === window) ? (window.scrollX || 0) : (_overlayContainer.scrollLeft || 0);
+        }
+        function containerScrollTop() {
+            return (_overlayContainer === window) ? (window.scrollY || 0) : (_overlayContainer.scrollTop || 0);
+        }
+
+        function toCanvasCoords(clientX, clientY) {
+            const rect = _overlay.getBoundingClientRect();
+            return { x: clientX - rect.left, y: clientY - rect.top };
+        }
+
+        function onPointerDown(e) {
+            const tool = document.body.dataset.tool;
+            if (tool === 'arrow') {
+                startDrawArrow(e);
+            } else if (tool === 'pen') {
+                startDrawPen(e);
+            } else if (tool === 'eraser') {
+                eraseAt(e);
+            } else if (tool === 'spotlight') {
+                startDrawSpotlight(e);
+            }
+        }
+
+        function onPointerMove(e) {
+            // Update cursor position
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _cursorX = pt.x;
+            _cursorY = pt.y;
+            
+            if (!_drawing) {
+                // Just update cursor position and re-render
+                if (_cursorVisible) {
+                    renderOverlay();
+                }
+                return;
+            }
+            
+            if (_drawing.type === 'pen') {
+                moveDrawPen(e);
+            } else if (_drawing.type === 'spotlight') {
+                moveDrawSpotlight(e);
+            } else {
+                moveDrawArrow(e);
+            }
+        }
+        
+        function onPointerEnter(e) {
+            _cursorVisible = document.body.dataset.tool !== 'none';
+            if (_cursorVisible) {
+                renderOverlay();
+            }
+        }
+        
+        function onPointerLeave(e) {
+            _cursorVisible = false;
+            renderOverlay();
+        }
+
+        function onPointerUp(e) {
+            if (!_drawing) return;
+            if (_drawing.type === 'pen') {
+                endDrawPen();
+            } else if (_drawing.type === 'spotlight') {
+                endDrawSpotlight();
+            } else {
+                endDrawArrow();
+            }
+        }
+
+        function startDrawArrow(e) {
+            if (document.body.dataset.tool !== 'arrow') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                x1: pt.x + containerScrollLeft(),
+                y1: pt.y + containerScrollTop(),
+                x2: pt.x + containerScrollLeft(),
+                y2: pt.y + containerScrollTop(),
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawArrow(e) {
+            if (!_drawing) return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.x2 = pt.x + containerScrollLeft();
+            _drawing.y2 = pt.y + containerScrollTop();
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawArrow() {
+            if (!_drawing) return;
+            _shapes.push({ 
+                type: 'arrow', 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawPen(e) {
+            if (document.body.dataset.tool !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'pen',
+                points: [{
+                    x: pt.x + containerScrollLeft(),
+                    y: pt.y + containerScrollTop()
+                }],
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawPen(e) {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.points.push({
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop()
+            });
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawPen() {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            if (_drawing.points.length > 1) {
+                _shapes.push({ 
+                    ..._drawing,
+                    createdAt: Date.now(),
+                    fadeoutTime: getFadeoutTime(),
+                    opacity: 1.0
+                });
+            }
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawSpotlight(e) {
+            if (document.body.dataset.tool !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'spotlight',
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop(),
+                radius: getLineThickness() * 20, // Use thickness to control spotlight size (bigger default)
+                color: getArrowColor()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawSpotlight(e) {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const dx = pt.x + containerScrollLeft() - _drawing.x;
+            const dy = pt.y + containerScrollTop() - _drawing.y;
+            _drawing.radius = Math.max(20, Math.sqrt(dx * dx + dy * dy)); // Minimum radius of 20
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawSpotlight() {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            _shapes.push({ 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function distPointToSegment(px, py, x1, y1, x2, y2) {
+            const dx = x2 - x1, dy = y2 - y1;
+            if (dx === 0 && dy === 0) return Math.hypot(px - x1, py - y1);
+            const t = Math.max(0, Math.min(1, ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)));
+            const cx = x1 + t * dx, cy = y1 + t * dy;
+            return Math.hypot(px - cx, py - cy);
+        }
+
+        function eraseAt(e) {
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const x = pt.x + containerScrollLeft();
+            const y = pt.y + containerScrollTop();
+            const threshold = 10; // pixels
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const s = _shapes[i];
+                if (s.type === 'arrow') {
+                    const d = distPointToSegment(x, y, s.x1, s.y1, s.x2, s.y2);
+                    if (d <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                } else if (s.type === 'pen' && s.points) {
+                    // Check if click is near any line segment in the pen stroke
+                    let minDist = Infinity;
+                    for (let j = 1; j < s.points.length; j++) {
+                        const d = distPointToSegment(x, y, s.points[j-1].x, s.points[j-1].y, s.points[j].x, s.points[j].y);
+                        minDist = Math.min(minDist, d);
+                    }
+                    if (minDist <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                }
+            }
+            e.preventDefault();
+        }
+
+        function drawArrow(ctx, x1, y1, x2, y2, color, width, opacity = 1.0) {
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.fillStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            
+            // Check if points are too close (initial state)
+            const dx = x2 - x1;
+            const dy = y2 - y1;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            
+            if (distance < 5) {
+                // Draw just a small arrowhead pointing down-right when first clicked
+                const defaultAngle = Math.PI / 4; // 45 degrees (down-right)
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate arrowhead points
+                const hx1 = x1 + headLength * Math.cos(defaultAngle - headAngle);
+                const hy1 = y1 + headLength * Math.sin(defaultAngle - headAngle);
+                const hx2 = x1 + headLength * Math.cos(defaultAngle + headAngle);
+                const hy2 = y1 + headLength * Math.sin(defaultAngle + headAngle);
+                
+                // Draw arrowhead only
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            } else {
+                // Normal arrow drawing - head at x1,y1, tail at x2,y2
+                const angle = Math.atan2(y1 - y2, x1 - x2);
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate where the line should end (before the arrowhead)
+                const lineEndX = x1 - headLength * 0.8 * Math.cos(angle);
+                const lineEndY = y1 - headLength * 0.8 * Math.sin(angle);
+                
+                // Draw the line from tail to near the head
+                ctx.beginPath();
+                ctx.moveTo(x2, y2);
+                ctx.lineTo(lineEndX, lineEndY);
+                ctx.stroke();
+                
+                // Calculate arrowhead points
+                const hx1 = x1 - headLength * Math.cos(angle - headAngle);
+                const hy1 = y1 - headLength * Math.sin(angle - headAngle);
+                const hx2 = x1 - headLength * Math.cos(angle + headAngle);
+                const hy2 = y1 - headLength * Math.sin(angle + headAngle);
+                
+                // Draw arrowhead
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            }
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawPen(ctx, points, color, width, offX, offY, opacity = 1.0) {
+            if (!points || points.length < 2) return;
+            
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            ctx.beginPath();
+            ctx.moveTo(points[0].x - offX, points[0].y - offY);
+            for (let i = 1; i < points.length; i++) {
+                ctx.lineTo(points[i].x - offX, points[i].y - offY);
+            }
+            ctx.stroke();
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawAllSpotlights(ctx, spotlights, offX, offY) {
+            if (!spotlights || spotlights.length === 0) return;
+            
+            ctx.save();
+            
+            // Calculate the overall opacity based on all spotlights
+            const maxOpacity = Math.max(...spotlights.map(s => s.opacity || 1.0));
+            
+            // Fill entire canvas with dark overlay
+            ctx.fillStyle = `rgba(0, 0, 0, ${0.7 * maxOpacity})`;
+            ctx.fillRect(0, 0, ctx.canvas.width, ctx.canvas.height);
+            
+            // Cut out completely transparent holes for all spotlights
+            ctx.globalCompositeOperation = 'destination-out';
+            ctx.fillStyle = 'rgba(0, 0, 0, 1)'; // Solid black to ensure complete removal
+            for (const spotlight of spotlights) {
+                ctx.beginPath();
+                ctx.arc(spotlight.x - offX, spotlight.y - offY, spotlight.radius, 0, 2 * Math.PI);
+                ctx.fill();
+            }
+            
+            ctx.restore();
+        }
+
+        function renderOverlay() {
+            if (!_overlay || !_overlayCtx) return;
+            _overlayCtx.clearRect(0, 0, _overlay.width, _overlay.height);
+            const offX = containerScrollLeft();
+            const offY = containerScrollTop();
+            // Draw non-spotlight shapes first
+            for (const s of _shapes) {
+                const opacity = s.opacity !== undefined ? s.opacity : 1.0;
+                if (s.type === 'arrow') {
+                    drawArrow(_overlayCtx, s.x1 - offX, s.y1 - offY, s.x2 - offX, s.y2 - offY, s.color || '#f00', s.width || 2, opacity);
+                } else if (s.type === 'pen') {
+                    drawPen(_overlayCtx, s.points, s.color || '#f00', s.width || 2, offX, offY, opacity);
+                }
+            }
+            // Draw current drawing (non-spotlight)
+            if (_drawing) {
+                if (_drawing.type === 'pen') {
+                    drawPen(_overlayCtx, _drawing.points, _drawing.color, _drawing.width, offX, offY);
+                } else if (_drawing.type !== 'spotlight') {
+                    drawArrow(_overlayCtx, _drawing.x1 - offX, _drawing.y1 - offY, _drawing.x2 - offX, _drawing.y2 - offY, _drawing.color, _drawing.width);
+                }
+            }
+            
+            // Collect all spotlights (existing + current drawing + cursor preview)
+            const spotlights = [];
+            
+            // Add existing spotlight shapes
+            for (const s of _shapes) {
+                if (s.type === 'spotlight') {
+                    spotlights.push({
+                        x: s.x,
+                        y: s.y, 
+                        radius: s.radius,
+                        opacity: s.opacity !== undefined ? s.opacity : 1.0
+                    });
+                }
+            }
+            
+            // Add current spotlight being drawn
+            if (_drawing && _drawing.type === 'spotlight') {
+                spotlights.push({
+                    x: _drawing.x,
+                    y: _drawing.y,
+                    radius: _drawing.radius,
+                    opacity: 1.0
+                });
+            }
+            
+            // Add cursor preview spotlight if tool is active
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                if (tool === 'spotlight') {
+                    const thickness = getLineThickness();
+                    const radius = thickness * 20;
+                    const cursorWorldX = _cursorX + containerScrollLeft();
+                    const cursorWorldY = _cursorY + containerScrollTop();
+                    spotlights.push({
+                        x: cursorWorldX,
+                        y: cursorWorldY,
+                        radius: radius,
+                        opacity: 0.8
+                    });
+                }
+            }
+            
+            // Draw all spotlights as a single overlay with multiple holes
+            drawAllSpotlights(_overlayCtx, spotlights, offX, offY);
+            
+            // Draw cursor indicators for non-spotlight tools
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                const color = getArrowColor();
+                const thickness = getLineThickness();
+                
+                if (tool !== 'spotlight') {
+                    _overlayCtx.save();
+                    _overlayCtx.fillStyle = color;
+                    _overlayCtx.globalAlpha = 0.7;
+                    
+                    if (tool === 'eraser') {
+                        // Draw eraser indicator
+                        _overlayCtx.strokeStyle = color;
+                        _overlayCtx.lineWidth = 2;
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, 10, 0, 2 * Math.PI);
+                        _overlayCtx.stroke();
+                    } else {
+                        // Draw dot for pen/arrow
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, thickness / 2, 0, 2 * Math.PI);
+                        _overlayCtx.fill();
+                    }
+                    
+                    _overlayCtx.restore();
+                }
+            }
+        }
+
+        function setOverlayActive(active) {
+            if (!_overlay) initOverlay();
+            _overlay.style.pointerEvents = active ? 'auto' : 'none';
+            _overlay.style.cursor = active ? 'none' : 'auto';
+            // Re-render to ensure visibility aligns with content
+            renderOverlay();
+        }
+
+        function initOverlay() {
+            if (_overlay) return;
+            updateOverlayModeAndContainer();
+            _overlay = document.createElement('canvas');
+            _overlay.className = 'draw-overlay';
+            _overlayCtx = _overlay.getContext('2d');
+            document.body.appendChild(_overlay);
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+
+            // Events
+            _overlay.addEventListener('mousedown', onPointerDown);
+            _overlay.addEventListener('mousemove', onPointerMove);
+            _overlay.addEventListener('mouseenter', onPointerEnter);
+            _overlay.addEventListener('mouseleave', onPointerLeave);
+            document.addEventListener('mouseup', onPointerUp);
+            _overlay.addEventListener('touchstart', onPointerDown, { passive: false });
+            _overlay.addEventListener('touchmove', onPointerMove, { passive: false });
+            document.addEventListener('touchend', onPointerUp);
+
+            _overlayResizeHandler = () => updateOverlayBounds();
+            window.addEventListener('resize', _overlayResizeHandler);
+
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+            
+            // Start fade animation timer
+            _fadeTimer = setInterval(updateShapesFade, 100); // Update every 100ms for smooth fade
+        }
+
+        function rebindOverlayContainer() {
+            if (!_overlay) return;
+            // Remove old scroll handler
+            if (_overlayScrollHandler) { window.removeEventListener('scroll', _overlayScrollHandler); }
+            updateOverlayModeAndContainer();
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+        }
+
+        function teardownOverlay() {
+            if (!_overlay) return;
+            _overlay.removeEventListener('mousedown', onPointerDown);
+            _overlay.removeEventListener('mousemove', onPointerMove);
+            _overlay.removeEventListener('mouseenter', onPointerEnter);
+            _overlay.removeEventListener('mouseleave', onPointerLeave);
+            document.removeEventListener('mouseup', onPointerUp);
+            _overlay.removeEventListener('touchstart', onPointerDown);
+            _overlay.removeEventListener('touchmove', onPointerMove);
+            document.removeEventListener('touchend', onPointerUp);
+            if (_overlayResizeHandler) window.removeEventListener('resize', _overlayResizeHandler);
+            if (_overlayScrollHandler) {
+                if (_overlayContainer === window) {
+                    window.removeEventListener('scroll', _overlayScrollHandler);
+                } else if (_overlayContainer) {
+                    _overlayContainer.removeEventListener('scroll', _overlayScrollHandler);
+                }
+            }
+            if (_fadeTimer) {
+                clearInterval(_fadeTimer);
+                _fadeTimer = null;
+            }
+            if (_overlay.parentNode) _overlay.parentNode.removeChild(_overlay);
+            _overlay = null; _overlayCtx = null; _overlayContainer = null; _overlayResizeHandler = null; _overlayScrollHandler = null; _drawing = null;
+        }
+        
+        function teardownFileExplorer() {
+            const fe = document.querySelector('.file-explorer');
+            if (fe && fe.parentNode) fe.parentNode.removeChild(fe);
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+
+        function runCell(cellId){
+            const btn=document.querySelector('.run-btn[onclick*="'+cellId+'"]');
+            const output=document.getElementById('output-'+cellId);
+            if(btn){btn.textContent='⏳ running...';btn.disabled=true;}
+            if(output){output.classList.add('output-stale');}
+            fetch('/run/'+cellId,{method:'POST'}).then(r=>r.json()).then(data=>{
+                if(output){
+                    output.classList.remove('output-stale');
+                    let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
+
+                    console.log('UV Logs:', data);
+                    if(data.stderr) {
+                        // Split UV logs from regular stderr
+                        const lines = data.stderr.split('\\n');
+                        let uvLogs = [];
+                        let regularLogs = [];
+                        let inUvSection = true;
+
+                        for (const line of lines) {
+                            if (inUvSection) {
+                                uvLogs.push(line);
+                                if (line.startsWith('Installed ')) {
+                                    inUvSection = false;
+                                }
+                            } else {
+                                regularLogs.push(line);
+                            }
+                        }
+                        
+
+                        // If we never found "Installed", treat it all as regular stderr
+                        if (inUvSection) {
+                            html+='<div class="cell-stderr">'+escapeHtml(data.stderr)+'</div>';
+                        } else {
+                            const uvLogsStr = uvLogs.join('\\n');
+                            const regularLogsStr = regularLogs.join('\\n').trim();
+
+                            if (uvLogsStr) {
+                                html+='<div class="uv-install-logs">';
+                                html+='<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>';
+                                html+='<div class="uv-logs-content" style="display: none;">'+escapeHtml(uvLogsStr)+'</div>';
+                                html+='</div>';
+                            }
+                            if (regularLogsStr) {
+                                html+='<div class="cell-stderr">'+escapeHtml(regularLogsStr)+'</div>';
+                            }
+                        }
+                    }
+                    output.innerHTML=html;
+                }
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            }).catch(e=>{
+                console.error('Run failed:',e);
+                if(output){output.classList.remove('output-stale');}
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            });
+        }
+
+        function copyCell(cellId){
+            // Try multiple selectors to find the code element
+            // Pygments generates .highlight > pre with spans, not wrapped in <code>
+            let codeElement = document.querySelector('#code-'+cellId+' .highlight pre');
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' pre');
+            }
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' code');
+            }
+            if (!codeElement) {
+                // Fallback to the code div itself
+                codeElement = document.getElementById('code-'+cellId);
+            }
+
+            const btn = document.querySelector('.copy-btn[onclick*="'+cellId+'"]');
+
+            if (!codeElement) {
+                console.error('Code element not found for cell:', cellId);
+                return;
+            }
+            if (!btn) {
+                console.error('Copy button not found for cell:', cellId);
+                return;
+            }
+
+            const codeText = codeElement.textContent;
+            
+            if (navigator.clipboard && navigator.clipboard.writeText) {
+                navigator.clipboard.writeText(codeText).then(function() {
+                    console.log('Clipboard copy successful');
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                }).catch(function(err) {
+                    console.warn('Clipboard copy failed:', err);
+                    fallbackCopy();
+                });
+            } else {
+                console.log('Using fallback copy method');
+                fallbackCopy();
+            }
+            
+            function fallbackCopy() {
+                const textarea = document.createElement('textarea');
+                textarea.value = codeText;
+                textarea.style.position = 'absolute';
+                textarea.style.left = '-9999px';
+                document.body.appendChild(textarea);
+                textarea.select();
+                try {
+                    const success = document.execCommand('copy');
+                    console.log('Fallback copy success:', success);
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                } catch (err) {
+                    console.error('Fallback copy failed:', err);
+                    btn.textContent = 'Copy failed';
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                    }, 2000);
+                }
+                document.body.removeChild(textarea);
+            }
+        }
+
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
+
+
+        document.addEventListener('DOMContentLoaded', function() {
+            // Capture initial cell states before any modifications
+            captureInitialCellStates();
+            
+            updateThemeIcon();
+            updateUiMenu();
+            updateUiDebug();
+            const widgetsEnabled = (document.documentElement.getAttribute('data-widgets') || 'on') === 'on';
+            if (widgetsEnabled) {
+            initMinimap();
+            initFileExplorer();
+            initTools();
+            initOverlay();
+            initStatusWidget();
+            initializeWidgetVisibility();
+            layoutWidgetsStackedBottomRight();
+            window.addEventListener('resize', layoutWidgetsStackedBottomRight);
+            }
+
+            // Apply deep-link selection if present
+            applyLocationFromUrl();
+            updateStateIndicator();
+            
+            // Apply cell states from URL immediately
+            const url = new URL(window.location.href);
+            const encodedCellStates = url.searchParams.get('cells');
+            if (encodedCellStates) {
+                console.log('Applying cell states from URL...');
+                const cellStates = decodeCellStatesFromUrl(encodedCellStates);
+                
+                // Use requestAnimationFrame to ensure DOM is ready
+                requestAnimationFrame(() => {
+                    applyCellStatesFromUrl(cellStates);
+                    
+                    // Clear initialization flag after cell states are applied
+                    if (typeof _isInitializing !== 'undefined') {
+                        _isInitializing = false;
+                    }
+                });
+            } else {
+                // Clear initialization flag even if no cell states
+                if (typeof _isInitializing !== 'undefined') {
+                    requestAnimationFrame(() => {
+                        _isInitializing = false;
+                    });
+                }
+            }
+
+            // Bind drag selection on line numbers
+            document.addEventListener('mousedown', onLineNumberMouseDown);
+            document.addEventListener('mousemove', onDocMouseMove);
+            document.addEventListener('mouseup', onDocMouseUp);
+
+            // Add ESC key handler to exit tools
+            document.addEventListener('keydown', function(e) {
+                if (e.key === 'Escape' || e.keyCode === 27) {
+                    const currentTool = document.body.dataset.tool;
+                    if (currentTool && currentTool !== 'none') {
+                        // Deactivate the current tool
+                        window.setActiveTool('none');
+                    }
+                    // Also clear any active line selection
+                    clearSelection(true);
+                }
+            });
+        });
+
+        function updateStateIndicator() {
+            try {
+                const el = document.getElementById('status-widget');
+                if (!el) return;
+                const tool = document.body.dataset.tool || 'none';
+                if (tool && tool !== 'none') {
+                    el.textContent = `tool: ${tool} — Esc`;
+                    return;
+                }
+                if (_selection) {
+                    const t = _selection.a === _selection.b ? `L${_selection.a}` : `L${_selection.a}-${_selection.b}`;
+                    el.textContent = `selected: ${t} — Esc`;
+                    return;
+                }
+                el.textContent = 'ready — Esc';
+            } catch (_) {}
+        }
+    </script>
+</head>
+
+
+<body>
+    <div class="controls">
+        <div class="controls-buttons">
+            
+            <a href="index.html" class="back-button">← back</a>
+            
+            <div class="theme-toggle" onclick="toggleTheme()">light</div>
+            <div class="reset-toggle" onclick="resetLayout()">reset</div>
+            <div class="menu-button" onclick="toggleMenu()">
+                menu ▼
+                <div class="menu-dropdown">
+                    <div class="menu-item" onclick="setUiTheme('default')">
+                        <span class="menu-checkbox" id="checkbox-ui-default">☑</span> Theme: default
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('none')">
+                        <span class="menu-checkbox" id="checkbox-ui-none">☐</span> Theme: none
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('monocolor')">
+                        <span class="menu-checkbox" id="checkbox-ui-monocolor">☐</span> Theme: monocolor
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('tools')">
+                        <span class="menu-checkbox" id="checkbox-tools">☐</span> Tools
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('file-explorer')">
+                        <span class="menu-checkbox" id="checkbox-file-explorer">☐</span> File Explorer
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('minimap')">
+                        <span class="menu-checkbox" id="checkbox-minimap">☐</span> Table of Contents
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('status')">
+                        <span class="menu-checkbox" id="checkbox-status">☑</span> Status Indicator
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    
+    <div class="system-info">
+        <div class="system-info-header">Generated on:</div>
+        <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+        </div>
+    </div>
+    
+    <div class="main-content">
+        <h1>PyTorch Native - Deformable DETR</h1>
+<h2>GPU Info</h2>
+<div class="cell" id="cell-nv">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span> 
+<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+</span> | 
+Cell: nv | 0.23s
+ | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
+<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/deformable_detr/impls/torch_deformable_detr.md" target="_blank" class="github-btn">GitHub</a>
+</div>
+<div id="code-nv" class="cell-code" data-lines="2">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">&quot;nvidia-smi&quot;</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-nv"></div>
+</div>
+</div>
+<div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:13:34 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   43C    P0             83W /  350W |       0MiB /  46068MiB |     60%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+</pre></div>
+</div>
+</div>
+
+<h2>Deformable DETR Multi-Scale Deformable Attention Benchmark (PyTorch Native)</h2>
+<div class="cell" id="cell-benchmark">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
+<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+</span> | 
+Cell: benchmark | 5.33s
+ | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
+<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/deformable_detr/impls/torch_deformable_detr.md" target="_blank" class="github-btn">GitHub</a>
+</div>
+<div id="code-benchmark" class="cell-code" data-lines="118">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
+<span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
+<span class="c1"># dependencies = [</span>
+<span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
+<span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
+<span class="c1"># ]</span>
+<span class="c1">#</span>
+<span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
+<span class="c1"># ///</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="kn">import</span> <span class="n">KernelTypeEnum</span><span class="p">,</span> <span class="n">run_benchmark</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">torch_deformable_detr</span><span class="p">(</span>
+    <span class="n">value</span><span class="p">,</span> <span class="n">spatial_shapes</span><span class="p">,</span> <span class="n">level_start_index</span><span class="p">,</span> <span class="n">sampling_locations</span><span class="p">,</span> <span class="n">attention_weights</span><span class="p">,</span> <span class="n">im2col_step</span><span class="o">=</span><span class="mi">64</span>
+<span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    PyTorch native reference implementation of multi-scale deformable attention.</span>
+<span class="sd">    Uses vectorized bilinear interpolation for reasonable performance.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="n">bs</span><span class="p">,</span> <span class="n">num_queries</span><span class="p">,</span> <span class="n">num_heads</span><span class="p">,</span> <span class="n">num_levels</span><span class="p">,</span> <span class="n">num_points</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">sampling_locations</span><span class="o">.</span><span class="n">shape</span>
+    <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">channels</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">shape</span>
+
+    <span class="n">output</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">bs</span><span class="p">,</span> <span class="n">num_queries</span><span class="p">,</span> <span class="n">num_heads</span><span class="p">,</span> <span class="n">channels</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">value</span><span class="o">.</span><span class="n">device</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">value</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
+
+    <span class="c1"># Split value tensor by levels</span>
+    <span class="n">value_list</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">split</span><span class="p">([</span><span class="nb">int</span><span class="p">(</span><span class="n">h</span> <span class="o">*</span> <span class="n">w</span><span class="p">)</span> <span class="k">for</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">spatial_shapes</span><span class="o">.</span><span class="n">tolist</span><span class="p">()],</span> <span class="n">dim</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
+
+    <span class="c1"># Iterate through each level (can&#39;t avoid this loop easily)</span>
+    <span class="k">for</span> <span class="n">level_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_levels</span><span class="p">):</span>
+        <span class="n">h</span><span class="p">,</span> <span class="n">w</span> <span class="o">=</span> <span class="n">spatial_shapes</span><span class="p">[</span><span class="n">level_idx</span><span class="p">]</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
+        <span class="n">value_level</span> <span class="o">=</span> <span class="n">value_list</span><span class="p">[</span><span class="n">level_idx</span><span class="p">]</span>  <span class="c1"># (bs, h*w, num_heads, channels)</span>
+
+        <span class="c1"># Reshape to spatial grid: (bs, num_heads, channels, h, w)</span>
+        <span class="n">value_spatial</span> <span class="o">=</span> <span class="n">value_level</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">bs</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">num_heads</span><span class="p">,</span> <span class="n">channels</span><span class="p">)</span><span class="o">.</span><span class="n">permute</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
+
+        <span class="c1"># Get sampling locations and weights for this level</span>
+        <span class="c1"># loc: (bs, num_queries, num_heads, num_points, 2)</span>
+        <span class="n">loc</span> <span class="o">=</span> <span class="n">sampling_locations</span><span class="p">[:,</span> <span class="p">:,</span> <span class="p">:,</span> <span class="n">level_idx</span><span class="p">,</span> <span class="p">:,</span> <span class="p">:]</span>
+        <span class="c1"># weight: (bs, num_queries, num_heads, num_points)</span>
+        <span class="n">weight</span> <span class="o">=</span> <span class="n">attention_weights</span><span class="p">[:,</span> <span class="p">:,</span> <span class="p">:,</span> <span class="n">level_idx</span><span class="p">,</span> <span class="p">:]</span>
+
+        <span class="c1"># Convert normalized coordinates to pixel coordinates</span>
+        <span class="c1"># loc[..., 0] is x (width), loc[..., 1] is y (height)</span>
+        <span class="n">x</span> <span class="o">=</span> <span class="n">loc</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="mi">0</span><span class="p">]</span> <span class="o">*</span> <span class="n">w</span> <span class="o">-</span> <span class="mf">0.5</span>  <span class="c1"># (bs, num_queries, num_heads, num_points)</span>
+        <span class="n">y</span> <span class="o">=</span> <span class="n">loc</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="mi">1</span><span class="p">]</span> <span class="o">*</span> <span class="n">h</span> <span class="o">-</span> <span class="mf">0.5</span>
+
+        <span class="c1"># Get integer coordinates for bilinear interpolation</span>
+        <span class="n">x0</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">floor</span><span class="p">(</span><span class="n">x</span><span class="p">)</span><span class="o">.</span><span class="n">long</span><span class="p">()</span>
+        <span class="n">y0</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">floor</span><span class="p">(</span><span class="n">y</span><span class="p">)</span><span class="o">.</span><span class="n">long</span><span class="p">()</span>
+        <span class="n">x1</span> <span class="o">=</span> <span class="n">x0</span> <span class="o">+</span> <span class="mi">1</span>
+        <span class="n">y1</span> <span class="o">=</span> <span class="n">y0</span> <span class="o">+</span> <span class="mi">1</span>
+
+        <span class="c1"># Compute interpolation weights BEFORE clamping (important!)</span>
+        <span class="n">lw</span> <span class="o">=</span> <span class="n">x</span> <span class="o">-</span> <span class="n">x0</span><span class="o">.</span><span class="n">float</span><span class="p">()</span>  <span class="c1"># weight for x direction</span>
+        <span class="n">lh</span> <span class="o">=</span> <span class="n">y</span> <span class="o">-</span> <span class="n">y0</span><span class="o">.</span><span class="n">float</span><span class="p">()</span>  <span class="c1"># weight for y direction</span>
+        <span class="n">hw</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">-</span> <span class="n">lw</span>
+        <span class="n">hh</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">-</span> <span class="n">lh</span>
+
+        <span class="c1"># Create mask for valid sample locations</span>
+        <span class="n">valid</span> <span class="o">=</span> <span class="p">(</span><span class="n">y</span> <span class="o">&gt;</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">x</span> <span class="o">&gt;</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">y</span> <span class="o">&lt;</span> <span class="n">h</span><span class="p">)</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">x</span> <span class="o">&lt;</span> <span class="n">w</span><span class="p">)</span>
+
+        <span class="c1"># Create masks for each corner being in bounds</span>
+        <span class="n">mask_tl</span> <span class="o">=</span> <span class="p">((</span><span class="n">y0</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">x0</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">))</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">float</span><span class="p">()</span>
+        <span class="n">mask_tr</span> <span class="o">=</span> <span class="p">((</span><span class="n">y0</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">x1</span> <span class="o">&lt;=</span> <span class="n">w</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">float</span><span class="p">()</span>
+        <span class="n">mask_bl</span> <span class="o">=</span> <span class="p">((</span><span class="n">y1</span> <span class="o">&lt;=</span> <span class="n">h</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">x0</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">))</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">float</span><span class="p">()</span>
+        <span class="n">mask_br</span> <span class="o">=</span> <span class="p">((</span><span class="n">y1</span> <span class="o">&lt;=</span> <span class="n">h</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">x1</span> <span class="o">&lt;=</span> <span class="n">w</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">float</span><span class="p">()</span>
+
+        <span class="c1"># Clamp coordinates for safe indexing</span>
+        <span class="n">x0_clamped</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">clamp</span><span class="p">(</span><span class="n">x0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">w</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span>
+        <span class="n">x1_clamped</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">clamp</span><span class="p">(</span><span class="n">x1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">w</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span>
+        <span class="n">y0_clamped</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">clamp</span><span class="p">(</span><span class="n">y0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">h</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span>
+        <span class="n">y1_clamped</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">clamp</span><span class="p">(</span><span class="n">y1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">h</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span>
+
+        <span class="c1"># Bilinear interpolation weights for all 4 corners</span>
+        <span class="n">w_tl</span> <span class="o">=</span> <span class="p">(</span><span class="n">hh</span> <span class="o">*</span> <span class="n">hw</span><span class="p">)</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>  <span class="c1"># top-left: (bs, num_queries, num_heads, num_points, 1)</span>
+        <span class="n">w_tr</span> <span class="o">=</span> <span class="p">(</span><span class="n">hh</span> <span class="o">*</span> <span class="n">lw</span><span class="p">)</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>  <span class="c1"># top-right</span>
+        <span class="n">w_bl</span> <span class="o">=</span> <span class="p">(</span><span class="n">lh</span> <span class="o">*</span> <span class="n">hw</span><span class="p">)</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>  <span class="c1"># bottom-left</span>
+        <span class="n">w_br</span> <span class="o">=</span> <span class="p">(</span><span class="n">lh</span> <span class="o">*</span> <span class="n">lw</span><span class="p">)</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>  <span class="c1"># bottom-right</span>
+
+        <span class="c1"># Gather values from the 4 corners using advanced indexing</span>
+        <span class="n">batch_idx</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">bs</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">value</span><span class="o">.</span><span class="n">device</span><span class="p">)</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">bs</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">expand</span><span class="p">(</span><span class="n">bs</span><span class="p">,</span> <span class="n">num_queries</span><span class="p">,</span> <span class="n">num_heads</span><span class="p">,</span> <span class="n">num_points</span><span class="p">)</span>
+        <span class="n">head_idx</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">num_heads</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">value</span><span class="o">.</span><span class="n">device</span><span class="p">)</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">num_heads</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">expand</span><span class="p">(</span><span class="n">bs</span><span class="p">,</span> <span class="n">num_queries</span><span class="p">,</span> <span class="n">num_heads</span><span class="p">,</span> <span class="n">num_points</span><span class="p">)</span>
+
+        <span class="c1"># Gather corner values with clamped indices, then apply corner masks</span>
+        <span class="n">v_tl</span> <span class="o">=</span> <span class="n">value_spatial</span><span class="p">[</span><span class="n">batch_idx</span><span class="p">,</span> <span class="n">head_idx</span><span class="p">,</span> <span class="p">:,</span> <span class="n">y0_clamped</span><span class="p">,</span> <span class="n">x0_clamped</span><span class="p">]</span> <span class="o">*</span> <span class="n">mask_tl</span>
+        <span class="n">v_tr</span> <span class="o">=</span> <span class="n">value_spatial</span><span class="p">[</span><span class="n">batch_idx</span><span class="p">,</span> <span class="n">head_idx</span><span class="p">,</span> <span class="p">:,</span> <span class="n">y0_clamped</span><span class="p">,</span> <span class="n">x1_clamped</span><span class="p">]</span> <span class="o">*</span> <span class="n">mask_tr</span>
+        <span class="n">v_bl</span> <span class="o">=</span> <span class="n">value_spatial</span><span class="p">[</span><span class="n">batch_idx</span><span class="p">,</span> <span class="n">head_idx</span><span class="p">,</span> <span class="p">:,</span> <span class="n">y1_clamped</span><span class="p">,</span> <span class="n">x0_clamped</span><span class="p">]</span> <span class="o">*</span> <span class="n">mask_bl</span>
+        <span class="n">v_br</span> <span class="o">=</span> <span class="n">value_spatial</span><span class="p">[</span><span class="n">batch_idx</span><span class="p">,</span> <span class="n">head_idx</span><span class="p">,</span> <span class="p">:,</span> <span class="n">y1_clamped</span><span class="p">,</span> <span class="n">x1_clamped</span><span class="p">]</span> <span class="o">*</span> <span class="n">mask_br</span>
+
+        <span class="c1"># Bilinear interpolation</span>
+        <span class="n">sampled</span> <span class="o">=</span> <span class="n">w_tl</span> <span class="o">*</span> <span class="n">v_tl</span> <span class="o">+</span> <span class="n">w_tr</span> <span class="o">*</span> <span class="n">v_tr</span> <span class="o">+</span> <span class="n">w_bl</span> <span class="o">*</span> <span class="n">v_bl</span> <span class="o">+</span> <span class="n">w_br</span> <span class="o">*</span> <span class="n">v_br</span>
+
+        <span class="c1"># Apply valid mask (only accumulate if entire sample location is valid)</span>
+        <span class="n">sampled</span> <span class="o">=</span> <span class="n">sampled</span> <span class="o">*</span> <span class="n">valid</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">float</span><span class="p">()</span>
+
+        <span class="c1"># Apply attention weights and sum over points</span>
+        <span class="c1"># weight: (bs, num_queries, num_heads, num_points)</span>
+        <span class="c1"># Expand weight: (bs, num_queries, num_heads, num_points, 1)</span>
+        <span class="n">weighted_sampled</span> <span class="o">=</span> <span class="n">sampled</span> <span class="o">*</span> <span class="n">weight</span><span class="o">.</span><span class="n">unsqueeze</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
+
+        <span class="c1"># Sum over points: (bs, num_queries, num_heads, channels)</span>
+        <span class="n">output</span> <span class="o">+=</span> <span class="n">weighted_sampled</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">dim</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
+
+    <span class="c1"># Flatten last two dimensions to match kernel output</span>
+    <span class="k">return</span> <span class="n">output</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">bs</span><span class="p">,</span> <span class="n">num_queries</span><span class="p">,</span> <span class="n">num_heads</span> <span class="o">*</span> <span class="n">channels</span><span class="p">)</span>
+
+
+<span class="n">run_benchmark</span><span class="p">(</span>
+    <span class="n">kernel_type</span><span class="o">=</span><span class="n">KernelTypeEnum</span><span class="o">.</span><span class="n">DEFORMABLE_DETR</span><span class="p">,</span>
+    <span class="n">impl_name</span><span class="o">=</span><span class="s2">&quot;torch_eager&quot;</span><span class="p">,</span>
+    <span class="n">impl_tags</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;family&quot;</span><span class="p">:</span> <span class="s2">&quot;pytorch&quot;</span><span class="p">,</span> <span class="s2">&quot;backend&quot;</span><span class="p">:</span> <span class="s2">&quot;eager&quot;</span><span class="p">},</span>
+    <span class="n">impl_func</span><span class="o">=</span><span class="n">torch_deformable_detr</span><span class="p">,</span>
+    <span class="n">dtype</span><span class="o">=</span><span class="s2">&quot;float32&quot;</span><span class="p">,</span>
+<span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-benchmark"></div>
+</div>
+</div>
+<div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Running deformable_detr benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      20.095ms      1353.99%      20.095ms      20.095ms             1  
+                                            torch_eager        21.57%       4.703ms        99.97%      21.796ms      21.796ms       0.000us         0.00%       1.485ms       1.485ms             1  
+                                            aten::index         4.62%       1.006ms        16.78%       3.660ms      76.241us     237.342us        15.99%     371.712us       7.744us            48  
+                                            aten::copy_         4.87%       1.061ms        11.32%       2.469ms      11.275us     365.385us        24.62%     365.385us       1.668us           219  
+                                              aten::mul         5.80%       1.265ms         9.92%       2.163ms      11.267us     294.264us        19.83%     294.264us       1.533us           192  
+void at::native::index_elementwise_kernel&lt;128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     237.342us        15.99%     237.342us       4.945us            48  
+                                               aten::to         0.67%     145.268us        11.20%       2.441ms      14.275us       0.000us         0.00%     231.015us       1.351us           171  
+                                         aten::_to_copy         2.25%     489.538us        10.53%       2.296ms      18.665us       0.000us         0.00%     231.015us       1.878us           123  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     202.558us        13.65%     202.558us       1.688us           120  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     167.074us        11.26%     167.074us       1.989us            84  
+                                       aten::contiguous         0.40%      86.639us         8.70%       1.898ms      19.769us       0.000us         0.00%     134.370us       1.400us            96  
+                                            aten::clone         0.85%     185.683us         8.31%       1.811ms      18.866us       0.000us         0.00%     134.370us       1.400us            96  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.370us         9.05%     134.370us       1.400us            96  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.390us         7.77%     115.390us       1.202us            96  
+                                          aten::__and__         0.63%     137.184us         4.49%     979.904us      11.666us       0.000us         0.00%     100.670us       1.198us            84  
+                                      aten::bitwise_and         2.39%     521.552us         3.87%     842.720us      10.032us     100.670us         6.78%     100.670us       1.198us            84  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     100.670us         6.78%     100.670us       1.198us            84  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      85.858us         5.78%      85.858us       1.192us            72  
+                                              aten::sub         2.24%     488.685us         3.68%     801.476us      11.132us      78.884us         5.32%      78.884us       1.096us            72  
+                                              aten::add         1.55%     338.597us         2.59%     564.753us       9.413us      74.082us         4.99%      74.082us       1.235us            60  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 21.803ms
+Self CUDA time total: 1.484ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      18.852ms      1182.31%      18.852ms      18.852ms             1  
+                                            torch_eager        20.99%       4.304ms        99.97%      20.495ms      20.495ms       0.000us         0.00%       1.595ms       1.595ms             1  
+                                            aten::index         4.61%     945.020us        16.80%       3.444ms      71.750us     251.167us        15.75%     382.850us       7.976us            48  
+                                            aten::copy_         5.04%       1.033ms        11.78%       2.414ms      11.023us     364.991us        22.89%     364.991us       1.667us           219  
+                                              aten::mul         5.94%       1.218ms        10.22%       2.095ms      10.911us     359.138us        22.52%     359.138us       1.871us           192  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     267.618us        16.78%     267.618us       2.230us           120  
+void at::native::index_elementwise_kernel&lt;128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     251.167us        15.75%     251.167us       5.233us            48  
+                                               aten::to         0.59%     120.975us        11.17%       2.290ms      13.390us       0.000us         0.00%     233.308us       1.364us           171  
+                                         aten::_to_copy         2.01%     411.895us        10.58%       2.169ms      17.632us       0.000us         0.00%     233.308us       1.897us           123  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     168.797us        10.59%     168.797us       2.009us            84  
+                                       aten::contiguous         0.41%      84.261us         8.87%       1.818ms      18.936us       0.000us         0.00%     131.683us       1.372us            96  
+                                            aten::clone         0.84%     172.318us         8.46%       1.734ms      18.058us       0.000us         0.00%     131.683us       1.372us            96  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     131.683us         8.26%     131.683us       1.372us            96  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     118.123us         7.41%     118.123us       1.230us            96  
+                                          aten::__and__         0.40%      81.276us         4.41%     903.196us      10.752us       0.000us         0.00%     104.833us       1.248us            84  
+                                      aten::bitwise_and         2.46%     504.088us         4.01%     821.920us       9.785us     104.833us         6.57%     104.833us       1.248us            84  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.833us         6.57%     104.833us       1.248us            84  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.190us         6.53%     104.190us       1.447us            72  
+                                              aten::add         1.62%     331.582us         2.72%     557.857us       9.298us      91.491us         5.74%      91.491us       1.525us            60  
+                                              aten::sub         2.17%     445.533us         3.70%     758.959us      10.541us      80.509us         5.05%      80.509us       1.118us            72  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 20.501ms
+Self CUDA time total: 1.595ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      18.792ms      1222.95%      18.792ms      18.792ms             1  
+                                            torch_eager        21.02%       4.299ms        99.97%      20.449ms      20.449ms       0.000us         0.00%       1.538ms       1.538ms             1  
+                                            aten::index         4.62%     944.347us        16.78%       3.432ms      71.497us     243.904us        15.87%     378.785us       7.891us            48  
+                                            aten::copy_         5.14%       1.051ms        11.72%       2.396ms      10.942us     368.961us        24.01%     368.961us       1.685us           219  
+                                              aten::mul         5.96%       1.219ms        10.23%       2.092ms      10.898us     325.334us        21.17%     325.334us       1.694us           192  
+void at::native::index_elementwise_kernel&lt;128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     243.904us        15.87%     243.904us       5.081us            48  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     234.457us        15.26%     234.457us       1.954us           120  
+                                               aten::to         0.61%     125.558us        11.02%       2.255ms      13.184us       0.000us         0.00%     234.080us       1.369us           171  
+                                         aten::_to_copy         1.92%     392.900us        10.41%       2.129ms      17.309us       0.000us         0.00%     234.080us       1.903us           123  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     169.246us        11.01%     169.246us       2.015us            84  
+                                       aten::contiguous         0.42%      85.559us         8.81%       1.802ms      18.772us       0.000us         0.00%     134.881us       1.405us            96  
+                                            aten::clone         0.80%     164.449us         8.39%       1.717ms      17.880us       0.000us         0.00%     134.881us       1.405us            96  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.881us         8.78%     134.881us       1.405us            96  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     115.650us         7.53%     115.650us       1.205us            96  
+                                          aten::__and__         0.39%      78.814us         4.36%     891.116us      10.609us       0.000us         0.00%     101.539us       1.209us            84  
+                                      aten::bitwise_and         2.44%     499.687us         3.97%     812.302us       9.670us     101.539us         6.61%     101.539us       1.209us            84  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     101.539us         6.61%     101.539us       1.209us            84  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      96.065us         6.25%      96.065us       1.334us            72  
+                                              aten::add         1.62%     331.717us         2.71%     554.333us       9.239us      83.900us         5.46%      83.900us       1.398us            60  
+                                              aten::sub         2.21%     451.413us         3.69%     755.537us      10.494us      79.361us         5.16%      79.361us       1.102us            72  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 20.454ms
+Self CUDA time total: 1.537ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us      19.115ms      1086.36%      19.115ms      19.115ms             1  
+                                            torch_eager        21.90%       4.346ms        99.98%      19.842ms      19.842ms       0.000us         0.00%       1.761ms       1.761ms             1  
+                                              aten::mul         6.18%       1.226ms        10.60%       2.104ms      10.960us     450.887us        25.63%     450.887us       2.348us           192  
+                                            aten::index         4.92%     977.403us        17.78%       3.530ms      73.537us     282.433us        16.05%     420.451us       8.759us            48  
+                                            aten::copy_         5.20%       1.031ms        12.05%       2.392ms      10.922us     372.637us        21.18%     372.637us       1.702us           219  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     357.955us        20.34%     357.955us       2.983us           120  
+void at::native::index_elementwise_kernel&lt;128, 4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     282.433us        16.05%     282.433us       5.884us            48  
+                                               aten::to         0.65%     128.684us        11.66%       2.315ms      13.536us       0.000us         0.00%     234.619us       1.372us           171  
+                                         aten::_to_copy         2.23%     442.466us        11.01%       2.186ms      17.772us       0.000us         0.00%     234.619us       1.907us           123  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     170.397us         9.68%     170.397us       2.029us            84  
+                                       aten::contiguous         0.44%      87.582us         9.26%       1.837ms      19.140us       0.000us         0.00%     138.018us       1.438us            96  
+                                            aten::clone         0.85%     168.452us         8.82%       1.750ms      18.228us       0.000us         0.00%     138.018us       1.438us            96  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     138.018us         7.84%     138.018us       1.438us            96  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     129.055us         7.33%     129.055us       1.792us            72  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     117.244us         6.66%     117.244us       1.221us            96  
+                                              aten::add         1.68%     334.180us         2.81%     557.305us       9.288us     113.660us         6.46%     113.660us       1.894us            60  
+                                          aten::__and__         0.41%      80.800us         4.55%     902.601us      10.745us       0.000us         0.00%     105.726us       1.259us            84  
+                                      aten::bitwise_and         2.56%     508.561us         4.14%     821.801us       9.783us     105.726us         6.01%     105.726us       1.259us            84  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     105.726us         6.01%     105.726us       1.259us            84  
+                                              aten::sub         2.25%     446.108us         3.80%     754.277us      10.476us      82.273us         4.68%      82.273us       1.143us            72  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 19.847ms
+Self CUDA time total: 1.760ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager              cuda_B1_Q100_H8_E256_L4_P4     3.39  True
+torch_eager              cuda_B1_Q300_H8_E256_L4_P4     4.01  True
+torch_eager              cuda_B2_Q100_H8_E256_L4_P4     4.02  True
+torch_eager              cuda_B2_Q300_H8_E256_L4_P4     4.02  True
+</pre></div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
+</div>
+</div>
+</div>
+    </div>
+    
+</body>
+</html>
\ No newline at end of file
diff --git a/deformable_detr/index.html b/deformable_detr/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..8e1ad7fe09342a610e525c8bba679a7f74857855
--- /dev/null
+++ b/deformable_detr/index.html
@@ -0,0 +1,89 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset='UTF-8'>
+  <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+  <title>Index of /deformable_detr</title>
+  <style>
+    :root {
+      --bg-primary: #0a0a0a;
+      --bg-secondary: #121212;
+      --bg-tertiary: #181818;
+      --text-primary: #e0e0e0;
+      --text-secondary: #888888;
+      --text-link: #64b5f6;
+      --border-primary: #2a2a2a;
+    }
+    body {
+      font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+      background: var(--bg-primary);
+      color: var(--text-primary);
+      margin: 0;
+      padding: 16px;
+      max-width: 900px;
+      margin: 0 auto;
+    }
+    .controls {
+      display: flex;
+      justify-content: flex-end;
+      margin-bottom: 1rem;
+    }
+    .back-button {
+      background: var(--bg-secondary);
+      border: 1px solid var(--border-primary);
+      padding: 8px 12px;
+      border-radius: 4px;
+      color: var(--text-secondary);
+      cursor: pointer;
+      font-size: 0.9rem;
+      text-decoration: none;
+      display: inline-block;
+    }
+    .back-button:hover {
+      color: var(--text-primary);
+      background: var(--bg-tertiary);
+    }
+    h1 {
+      font-size: 1.5em;
+      margin: 1rem 0;
+      color: var(--text-primary);
+      border-bottom: 1px solid var(--border-primary);
+      padding-bottom: 0.5rem;
+    }
+    ul {
+      list-style-type: none;
+      padding: 0;
+    }
+    li {
+      margin: 0;
+      border-bottom: 1px solid var(--border-primary);
+    }
+    li:last-child {
+      border-bottom: none;
+    }
+    a {
+      display: block;
+      padding: 0.75rem 0.5rem;
+      text-decoration: none;
+      color: var(--text-link);
+      transition: background 0.2s ease;
+    }
+    a:hover {
+      background: var(--bg-secondary);
+    }
+    .dir {
+      font-weight: 500;
+    }
+  </style>
+</head>
+<body>
+  <div class='controls'>
+    <a href='../index.html' class='back-button'>← back</a>
+  </div>
+  <h1>Index of /deformable_detr</h1>
+  <ul>
+    <li><a href='impls/index.html' class='dir'>impls/</a></li>
+    <li><a href='results/index.html' class='dir'>results/</a></li>
+  </ul>
+</body>
+</html>
\ No newline at end of file
diff --git a/deformable_detr/results/artifacts/combine/latency.svg b/deformable_detr/results/artifacts/combine/latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..cfe61b52935bc93cabc302ceb7b7fc02981aa5f7
--- /dev/null
+++ b/deformable_detr/results/artifacts/combine/latency.svg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b38828b5c85834f31812d3f314ebdc3cc2e8481610a6d31b84a4f9b0ad78c0f2
+size 17800
diff --git a/deformable_detr/results/cells/combine.py b/deformable_detr/results/cells/combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..973c7b81cc8cea6af69ab5e32268c4e63e71c8bb
--- /dev/null
+++ b/deformable_detr/results/cells/combine.py
@@ -0,0 +1,26 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+    "HF Kernels Deformable DETR": "UVNOTE_FILE_HF_KERNELS_DEFORMABLE_DETR_BENCHMARK",
+    "PyTorch Deformable DETR": "UVNOTE_FILE_TORCH_DEFORMABLE_DETR_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+    cache_env_map=cache_env_map,
+    output_filename="deformable_detr.jsonl",
+    svg_filename="latency.svg"
+)
\ No newline at end of file
diff --git a/deformable_detr/results/combined_results.html b/deformable_detr/results/combined_results.html
new file mode 100644
index 0000000000000000000000000000000000000000..a985624a2d9079877fe0cd1dcdefc5494402713c
--- /dev/null
+++ b/deformable_detr/results/combined_results.html
@@ -0,0 +1,4805 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Deformable DETR Benchmark - Combined Results</title>
+
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&display=swap" rel="stylesheet">
+
+    <script>
+// Iframe-friendly navigation router: only when this page is not the top window, honours a '#/<path>' hash on load and mirrors link clicks into location.hash
+        (function() {
+            const isIframe = window.self !== window.top;
+            if (!isIframe) return; // Only activate in iframe context
+
+            // On load: if hash points to a different page, navigate there. NOTE(review): currentPath is only the last path segment, so a hash containing '/' never matches it — confirm hashes are bare filenames
+            const hash = window.location.hash;
+            if (hash && hash.startsWith('#/')) {
+                const targetPath = hash.slice(2); // Remove '#/'
+                const currentPath = window.location.pathname.split('/').pop();
+
+                // Only navigate if we're not already on the target page (targetPath is resolved as a relative URL)
+                if (targetPath !== currentPath) {
+                    window.location.href = targetPath;
+                    return; // Stop execution, we're navigating away
+                }
+            }
+
+            // Intercept clicks on any <a> (or an element nested inside one) for hash-based navigation
+            document.addEventListener('click', function(e) {
+                const link = e.target.closest('a');
+                if (!link) return;
+
+                const href = link.getAttribute('href');
+
+                // Leave missing hrefs, same-page anchors ('#...'), hrefs starting with 'http', and javascript: links to the browser
+                if (!href || href.startsWith('#') || href.startsWith('http') || href.startsWith('javascript:')) {
+                    return;
+                }
+
+                e.preventDefault();
+
+                // Resolve the href against the current page URL and keep only its absolute pathname
+                const url = new URL(href, window.location.href);
+                let fullPath = url.pathname;
+
+                // Remove leading slash if present for cleaner paths
+                if (fullPath.startsWith('/')) {
+                    fullPath = fullPath.slice(1);
+                }
+
+                // NOTE(review): this writes the hash on this window's own location (the iframe), not on window.parent — confirm how the embedding page is expected to pick it up
+                window.location.hash = '#/' + fullPath;
+
+                // For HTML files, navigate within iframe. NOTE(review): only the last path segment is used, so the directory part of the href is dropped — confirm links never cross directories
+                if (fullPath.endsWith('.html') || fullPath.endsWith('/')) {
+                    const pathParts = fullPath.split('/').filter(p => p);
+                    const targetFile = pathParts[pathParts.length - 1] || 'index.html';
+                    window.location.href = targetFile;
+                } else {
+                    // For non-HTML files (raw .py, etc), open the original href in a new tab/window
+                    window.open(href, '_blank');
+                }
+            });
+        })();
+
+        // Apply theme, UI style and widget visibility as data-* attributes on <html> immediately to prevent flicker. NOTE(review): the localStorage reads below are unguarded and can throw where storage is blocked — confirm acceptable
+        (function() {
+            const configTheme = 'dark'; // default theme used when nothing is stored under 'uvnote-theme'
+            const hasConfigUi = false;
+            const configUi = hasConfigUi ? 'None' : null; // null here because hasConfigUi is false
+            const hasWidgetsConfig = false;
+            const widgetsOn = hasWidgetsConfig ? false : true; // true here because hasWidgetsConfig is false
+            let theme;
+            if (configTheme === 'auto') {
+                theme = window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
+            } else {
+                theme = localStorage.getItem('uvnote-theme') || configTheme; // stored choice wins over the configured default
+            }
+            document.documentElement.setAttribute('data-theme', theme);
+
+            // Initialize UI theme (css theme): config value if present, else stored 'uvnote-ui', else 'default'; unrecognized values fall back to 'default'
+            let ui = hasConfigUi ? configUi : (localStorage.getItem('uvnote-ui') || 'default');
+            if (ui !== 'default' && ui !== 'none' && ui !== 'monocolor') { ui = 'default'; }
+            document.documentElement.setAttribute('data-ui', ui);
+
+            // Apply widgets visibility as data-widgets='on' or 'off' (read by the [data-widgets="off"] CSS rules)
+            document.documentElement.setAttribute('data-widgets', widgetsOn ? 'on' : 'off');
+        })();
+    </script>
+    <style>
+:root[data-theme="light"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: #f6f8fa;
+    --bg-tertiary: #f8f9fa;
+    --bg-code: #f8f9fa;
+    --bg-error: #fdf2f2;
+    --bg-artifact: #e6f3ff;
+    --bg-artifact-hover: #d0e7ff;
+
+    --text-primary: #333;
+    --text-secondary: #656d76;
+    --text-error: #c53030;
+    --text-link: #0969da;
+
+    --border-primary: #e1e5e9;
+    --border-error: #e53e3e;
+    --border-cell-failed: #d73a49;
+
+    --shadow: rgba(0, 0, 0, 0.1);
+}
+
+:root[data-theme="dark"] {
+    --bg-primary: #0a0a0a;
+    --bg-secondary: #121212;
+    --bg-tertiary: #181818;
+    --bg-code: #0d0d0d;
+    --bg-error: #1a0f0f;
+    --bg-artifact: #151515;
+    --bg-artifact-hover: #1a1a1a;
+
+    --text-primary: #e0e0e0;
+    --text-secondary: #888888;
+    --text-error: #ff6b6b;
+    --text-link: #64b5f6;
+
+    --border-primary: #2a2a2a;
+    --border-error: #ff6b6b;
+    --border-cell-failed: #ff6b6b;
+
+    --shadow: rgba(255, 255, 255, 0.05);
+}
+
+/* Monocolor UI theme: black/white background, all text/borders single blue */
+:root[data-ui="monocolor"] {
+    --mono-color: #0a66ff;
+}
+
+:root[data-ui="monocolor"][data-theme="light"] {
+    --bg-primary: #ffffff;
+}
+
+:root[data-ui="monocolor"][data-theme="dark"] {
+    --bg-primary: #000000;
+}
+
+:root[data-ui="monocolor"] {
+    --bg-secondary: var(--bg-primary);
+    --bg-tertiary: var(--bg-primary);
+    --bg-code: var(--bg-primary);
+    --bg-error: var(--bg-primary);
+    --bg-artifact: var(--bg-primary);
+    --bg-artifact-hover: var(--bg-primary);
+
+    --text-primary: var(--mono-color);
+    --text-secondary: var(--mono-color);
+    --text-error: var(--mono-color);
+    --text-link: var(--mono-color);
+
+    --border-primary: var(--mono-color);
+    --border-error: var(--mono-color);
+    --border-cell-failed: var(--mono-color);
+
+    --shadow: none;
+}
+
+:root[data-ui="monocolor"] a {
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button,
+:root[data-ui="monocolor"] .theme-toggle,
+:root[data-ui="monocolor"] .reset-toggle,
+:root[data-ui="monocolor"] .back-button {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button:hover,
+:root[data-ui="monocolor"] .theme-toggle:hover,
+:root[data-ui="monocolor"] .reset-toggle:hover,
+:root[data-ui="monocolor"] .back-button:hover {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-dropdown {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    box-shadow: none;
+}
+
+:root[data-ui="monocolor"] .menu-item {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .system-info {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell {
+    border-color: var(--mono-color);
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .cell-header {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact:hover {
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .artifact-preview img,
+:root[data-ui="monocolor"] .artifact-preview svg {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .status-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .minimap,
+:root[data-ui="monocolor"] .file-explorer,
+:root[data-ui="monocolor"] .tools-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell-code {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tools-title,
+:root[data-ui="monocolor"] .file-explorer-section-title,
+:root[data-ui="monocolor"] .minimap-title {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button.active {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .file-explorer-item,
+:root[data-ui="monocolor"] .minimap-item {
+    color: var(--mono-color);
+}
+
+/* Force Pygments code to mono blue on mono bg */
+:root[data-ui="monocolor"] .highlight {
+    background: var(--bg-primary) !important;
+    color: var(--mono-color) !important;
+}
+
+:root[data-ui="monocolor"] .highlight *,
+:root[data-ui="monocolor"] .highlight .hll {
+    color: var(--mono-color) !important;
+    background: transparent !important;
+    border-color: var(--mono-color) !important;
+}
+
+/* Default code font + metrics (overridable via frontmatter) */
+:root {
+    --code-font-size: 0.95rem;
+    --code-line-height: 1.5;
+    --code-pad-y: 0.75rem;
+}
+
+/* Minimal UI theme overrides base variables for a flatter, 90s look */
+:root[data-ui="none"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: transparent;
+    --bg-tertiary: transparent;
+    --bg-code: #f9f9f9;
+    --bg-error: #fff0f0;
+    --bg-artifact: #f0f7ff;
+    --bg-artifact-hover: #e5f1ff;
+
+    --text-primary: #000000;
+    --text-secondary: #222222;
+    --text-error: #a00000;
+    --text-link: #0000ee;
+
+    --border-primary: #cccccc;
+    --border-error: #cc0000;
+    --border-cell-failed: #cc0000;
+
+    --shadow: none;
+}
+
+html {
+    overscroll-behavior: none;
+}
+
+body {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    line-height: 1.4;
+    max-width: 1000px;
+    margin: 0 auto;
+    padding: 15px;
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    transition: background-color 0.2s ease, color 0.2s ease;
+    overscroll-behavior: none;
+}
+
+/* Minimal "none" UI theme overrides */
+:root[data-ui="none"] body {
+    font-family: 'Times New Roman', Times, serif;
+    line-height: 1.5;
+    max-width: 860px;
+    padding: 12px;
+    background: #ffffff;
+    color: #000000;
+    transition: none;
+}
+
+/* Two panel layout removed */
+
+.controls {
+    position: fixed;
+    top: 20px;
+    right: 20px;
+    display: flex;
+    flex-direction: column;
+    align-items: flex-end;
+    gap: 0.25rem;
+    z-index: 1000;
+}
+
+.controls-buttons {
+    display: flex;
+    gap: 0.5rem;
+}
+
+.menu-button {
+    position: relative;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+/* Keep default control styling when widgets are enabled, even in minimal UI */
+:root[data-ui="none"][data-widgets="on"] .menu-button,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle,
+:root[data-ui="none"][data-widgets="on"] .back-button {
+    background: #f6f6f6;
+    border: 1px solid #cccccc;
+    color: #222222;
+}
+
+.menu-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+/* Controls state indicator (top-right) */
+/* Status widget (bottom-right) */
+.status-widget {
+    position: fixed;
+    right: 20px;
+    bottom: 20px;
+    width: auto;
+    max-width: 260px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 6px 8px;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    z-index: 100;
+}
+
+.status-widget strong {
+    color: var(--text-primary);
+}
+
+:root[data-ui="none"][data-widgets="on"] .status-widget {
+    background: #f6f6f6;
+    border-color: #ccc;
+    color: #222;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .back-button:hover {
+    background: #ededed;
+    border-color: #bbbbbb;
+    color: #000000;
+}
+
+.menu-dropdown {
+    position: absolute;
+    top: 100%;
+    right: 0;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    box-shadow: 0 4px 12px var(--shadow);
+    min-width: 160px;
+    opacity: 0;
+    visibility: hidden;
+    transform: translateY(-8px);
+    transition: all 0.2s ease;
+    z-index: 1001;
+    margin-top: 4px;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-dropdown {
+    background: #ffffff;
+    border: 1px solid #cccccc;
+    box-shadow: none;
+}
+
+.menu-button.active .menu-dropdown {
+    opacity: 1;
+    visibility: visible;
+    transform: translateY(0);
+}
+
+.menu-item {
+    display: block;
+    padding: 8px 12px;
+    color: var(--text-secondary);
+    text-decoration: none;
+    font-size: 0.85rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: pointer;
+}
+
+:root[data-ui="none"] .menu-item {
+    color: #000;
+    border-bottom: 1px solid #eee;
+}
+
+.menu-item:last-child {
+    border-bottom: none;
+}
+
+.menu-item:hover {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+}
+
+.menu-checkbox {
+    display: inline-block;
+    width: 16px;
+    font-family: monospace;
+    color: var(--text-link);
+}
+
+.theme-toggle,
+.reset-toggle,
+.back-button {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 4px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+.back-button {
+    text-decoration: none;
+    display: inline-block;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover,
+.back-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+.system-info {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    padding: 8px 12px;
+    margin-bottom: 16px;
+    font-size: 0.85em;
+    color: var(--text-secondary);
+}
+
+.system-info-header {
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 2px;
+}
+
+.system-info-content {
+    font-family: monospace;
+}
+
+.theme-toggle,
+.reset-toggle {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    /* padding: 0.4rem 0.6rem; */
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    user-select: none;
+    transition: all 0.2s ease;
+    text-transform: lowercase;
+    letter-spacing: 0;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover {
+    background: var(--bg-tertiary);
+    border-color: var(--text-secondary);
+    color: var(--text-primary);
+}
+
+.minimap {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Hide widgets and controls when disabled via frontmatter */
+:root[data-widgets="off"] .controls,
+:root[data-widgets="off"] .minimap,
+:root[data-widgets="off"] .file-explorer,
+:root[data-widgets="off"] .tools-widget,
+:root[data-widgets="off"] .status-widget {
+    display: none !important;
+}
+
+.file-explorer {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Drawing overlay */
+.draw-overlay {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100vw;
+    height: 100vh;
+    z-index: 80;
+    /* under widgets (100) and controls (1000) */
+    display: block;
+    pointer-events: none;
+    /* enabled only when a tool is active */
+}
+
+/* Tools widget */
+.tools-widget {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    z-index: 100;
+    opacity: 0.95;
+}
+
+.tools-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    user-select: none;
+}
+
+.tools-row {
+    display: flex;
+    gap: 0.4rem;
+    flex-wrap: wrap;
+}
+
+.tool-button {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.25rem 0.4rem;
+    cursor: pointer;
+    color: var(--text-secondary);
+    font-family: inherit;
+    font-size: 0.75rem;
+    user-select: none;
+}
+
+.tool-button:hover {
+    color: var(--text-primary);
+}
+
+.tool-button.active {
+    color: var(--text-primary);
+    border-color: var(--text-secondary);
+    background: var(--bg-secondary);
+}
+
+.minimap:hover,
+.file-explorer:hover {
+    opacity: 1;
+}
+
+.minimap-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.minimap-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.15rem 0;
+    border-left: 2px solid transparent;
+    padding-left: 0.5rem;
+    transition: all 0.2s ease;
+    cursor: pointer;
+}
+
+.minimap-item:hover {
+    color: var(--text-primary);
+    border-left-color: var(--text-secondary);
+}
+
+.minimap-item.active {
+    color: var(--text-primary);
+    border-left-color: var(--text-link);
+}
+
+.minimap-heading {
+    font-weight: normal;
+}
+
+.minimap-heading.h1 {
+    padding-left: 0.5rem;
+}
+
+.minimap-heading.h2 {
+    padding-left: 1rem;
+}
+
+.minimap-heading.h3 {
+    padding-left: 1.5rem;
+}
+
+.minimap-heading.h4 {
+    padding-left: 2rem;
+}
+
+.minimap-heading.h5 {
+    padding-left: 2.5rem;
+}
+
+.minimap-heading.h6 {
+    padding-left: 3rem;
+}
+
+.minimap-cell {
+    color: var(--text-link);
+    opacity: 0.8;
+    font-style: italic;
+}
+
+.minimap-cell:hover {
+    opacity: 1;
+}
+
+.file-explorer-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.file-explorer-section {
+    margin-bottom: 0.75rem;
+}
+
+.file-explorer-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin-bottom: 0.25rem;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.file-explorer-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.1rem 0;
+    margin-left: 0.5rem;
+    transition: color 0.2s ease;
+    cursor: pointer;
+    font-family: monospace;
+}
+
+.file-explorer-item:hover {
+    color: var(--text-primary);
+}
+
+.file-explorer-item.script {
+    color: var(--text-link);
+}
+
+.file-explorer-item.artifact {
+    color: var(--text-secondary);
+    opacity: 0.8;
+}
+
+
+/* Hide widgets on smaller screens */
+@media (max-width: 768px) {
+
+    .minimap,
+    .file-explorer,
+    .tools-widget {
+        display: none;
+    }
+}
+
+.cell {
+    margin: 1rem 0;
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    overflow: hidden;
+    background: var(--bg-secondary);
+}
+
+:root[data-ui="none"] .cell {
+    margin: 1em 0;
+    border: none;
+    background: transparent;
+}
+
+/* Header bar of a cell. Every declaration for the header lives inside this
+   rule: declarations left at top level without a selector are swallowed into
+   the selector of the next rule and invalidate it (.cell-header:hover here). */
+.cell-header {
+    background: var(--bg-secondary);
+    padding: 0.5rem 1rem;
+    border-bottom: 1px solid var(--border-primary);
+    font-family: inherit;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    cursor: pointer;
+    user-select: none;
+    transition: background-color 0.2s ease;
+}
+
+/* Overrides applied when the root element carries data-ui="none" */
+:root[data-ui="none"] .cell-header {
+    background: transparent;
+    border: none;
+    padding: 0;
+    font-weight: bold;
+}
+
+:root[data-ui="none"] .cell-content {
+    padding: 0;
+}
+
+:root[data-ui="none"] .copy-button,
+:root[data-ui="none"] .collapse-indicators,
+:root[data-ui="none"] .cell-meta,
+:root[data-ui="none"] .cell-outputs-header {
+    display: none !important;
+}
+
+:root[data-ui="none"] pre,
+:root[data-ui="none"] code {
+    font-family: Menlo, Monaco, 'Courier New', monospace;
+}
+
+:root[data-ui="none"] .code-content pre {
+    background: #f9f9f9;
+    border: 1px solid #ddd;
+    padding: 8px;
+}
+
+:root[data-ui="none"] .output {
+    background: transparent;
+    border: none;
+    padding: 0.25em 0;
+}
+
+.cell-header:hover {
+    background: var(--bg-tertiary);
+}
+
+.collapse-indicators {
+    color: var(--text-secondary);
+    font-size: 0.8rem;
+    opacity: 0.7;
+}
+
+.collapse-indicators span:hover {
+    color: var(--text-primary);
+    opacity: 1;
+}
+
+.cell-code {
+    display: block;
+    background: var(--bg-code);
+}
+
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code pre {
+    margin: 0;
+    padding: 0.75rem;
+    background: var(--bg-code);
+    overflow-x: auto;
+    color: var(--text-primary);
+}
+
+.cell-output {
+    padding: 0.75rem;
+    /* background: var(--bg-primary); */
+    background: var(--bg-secondary);
+}
+
+.cell-output.collapsed {
+    display: none;
+}
+
+/* Captured stdout panel: a single rule carrying the combined declarations. */
+.cell-stdout {
+    /* sizing: stay within the available layout width, scroll when needed */
+    max-width: 100%;
+    overflow: auto;
+    padding: 0.75rem;
+    border-radius: 1px;
+
+    /* colors */
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+
+    /* text */
+    font-family: inherit;
+    font-size: 0.9rem;
+    white-space: pre-wrap;
+}
+
+.cell-stdout .stdout-text {
+    margin: 0;
+    /* reset pre default margin */
+    white-space: pre;
+    /* keep line breaks, NO wrapping */
+    display: inline-block;
+    /* shrink-to-content */
+    min-width: max-content;
+    /* allow very long lines to define intrinsic width */
+    font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+    tab-size: 2;
+}
+
+.cell-stderr {
+    background: var(--bg-error);
+    border-left: 2px solid var(--border-error);
+    padding: 1rem;
+    margin: 0.5rem 0;
+    font-family: inherit;
+    font-size: 0.9rem;
+    color: var(--text-error);
+    white-space: pre-wrap;
+}
+
+.uv-install-logs {
+    margin: 0.5rem 0;
+}
+
+/* Clickable header of the uv install log section.
+   --border-color is referenced only by these two rules in the visible
+   stylesheet (everything else uses --border-primary), so a fallback is given:
+   without it an undefined variable would drop the whole border-left. */
+.uv-logs-header {
+    cursor: pointer;
+    padding: 0.75rem;
+    border-left: 3px solid var(--border-color, var(--border-primary));
+    font-family: inherit;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    user-select: none;
+}
+
+/* Body of the uv install log section: monospace, wraps, scrolls sideways. */
+.uv-logs-content {
+    background: var(--bg-secondary);
+    padding: 1rem;
+    border-left: 3px solid var(--border-color, var(--border-primary));
+    white-space: pre-wrap;
+    font-family: monospace;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    overflow-x: auto;
+}
+
+.cell-artifacts {
+    margin: 1rem 0;
+}
+
+.cell-artifacts h4 {
+    margin: 0 0 0.5rem 0;
+    color: var(--text-secondary);
+    font-size: 0.9rem;
+}
+
+.artifact {
+    display: inline-block;
+    background: var(--bg-artifact);
+    padding: 0.25rem 0.5rem;
+    border-radius: 1px;
+    margin: 0.25rem 0.5rem 0.25rem 0;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-link);
+    text-decoration: none;
+    transition: background-color 0.2s ease;
+    border: 1px solid var(--border-primary);
+}
+
+.artifact:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-preview {
+    margin-top: 1rem;
+}
+
+.artifact-preview img {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.artifact-preview svg {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+    display: block;
+}
+
+/* Style SVG text elements */
+.artifact-preview svg g {
+    fill: var(--text-primary) !important;
+}
+
+/* Auto-theme SVG elements */
+.artifact-preview svg {
+    background: transparent;
+}
+
+/* Invert SVG images in dark mode */
+:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
+    filter: invert(0.9) hue-rotate(180deg);
+}
+
+/* Keep SVG images readable in monocolor mode */
+:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
+    filter: none;
+}
+
+/* CSV table styling */
+.artifact-csv {
+    margin-top: 1rem;
+    overflow-x: auto;
+}
+
+.csv-table {
+    width: 100%;
+    border-collapse: collapse;
+    font-size: 0.9rem;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.csv-table th,
+.csv-table td {
+    padding: 0.5rem 0.75rem;
+    text-align: left;
+    border: 1px solid var(--border-primary);
+}
+
+.csv-table th {
+    background: var(--bg-tertiary);
+    font-weight: 600;
+    color: var(--text-primary);
+}
+
+.csv-table tbody tr:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-csv-error {
+    margin-top: 1rem;
+    padding: 1rem;
+    background: var(--bg-error);
+    color: var(--text-error);
+    border: 1px solid var(--border-error);
+    border-radius: 1px;
+}
+
+.cell-failed {
+    border-color: var(--border-cell-failed);
+}
+
+.cell-failed .cell-header {
+    background: var(--bg-error);
+    color: var(--text-error);
+}
+
+.cell-commented {
+    opacity: 0.6;
+    border-style: dashed;
+}
+
+.cell-commented .cell-header {
+    background: var(--bg-secondary);
+    color: var(--text-secondary);
+    font-style: italic;
+}
+
+/* Shared look of the .run-btn / .copy-btn / .raw-btn / .github-btn / .hf-btn
+   classes. */
+.run-btn,
+.copy-btn,
+.raw-btn,
+.github-btn,
+.hf-btn {
+    margin-left: 4px;
+    padding: 2px 6px;
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    background: var(--bg-tertiary);
+    color: var(--text-secondary);
+    font-family: inherit;
+    font-size: 0.75em;
+    cursor: pointer;
+}
+
+/* Extra declarations carried by the raw / github / hf variants only. */
+.raw-btn,
+.github-btn,
+.hf-btn {
+    display: inline-block;
+    text-decoration: none;
+}
+
+.run-btn:hover,
+.copy-btn:hover,
+.raw-btn:hover,
+.github-btn:hover,
+.hf-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.raw-btn:hover,
+.github-btn:hover,
+.hf-btn:hover {
+    text-decoration: none;
+}
+
+/* Declared after :hover so a disabled button keeps these on hover too. */
+.run-btn:disabled,
+.copy-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+}
+
+/* Kept last: same specificity as .copy-btn:hover, so source order decides. */
+.copy-btn.copied {
+    color: #4caf50;
+    background: var(--bg-primary);
+    border-color: #4caf50;
+    transition: all 0.2s ease;
+}
+
+.output-stale {
+    opacity: 0.5;
+    position: relative;
+}
+
+.output-stale::after {
+    content: '⏳ updating...';
+    position: absolute;
+    top: 8px;
+    right: 8px;
+    background: var(--bg-secondary);
+    padding: 4px 8px;
+    border-radius: 2px;
+    font-size: 0.75em;
+    color: var(--text-secondary);
+    border: 1px solid var(--border-primary);
+}
+
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+    margin-top: 1.5rem;
+    margin-bottom: 0.75rem;
+    color: var(--text-primary);
+}
+
+h1 {
+    margin-top: 0;
+    margin-bottom: 1rem;
+}
+
+p {
+    margin: 0.75rem 0;
+    color: var(--text-primary);
+}
+
+a {
+    color: var(--text-link);
+}
+
+img {
+    max-width: 100%;
+    height: auto;
+    border-radius: 1px;
+    box-shadow: none;
+}
+
+pre,
+code {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+}
+
+.code-wrap {
+    position: relative;
+}
+
+.code-line-highlight {
+    display: none;
+    position: absolute;
+    left: 0;
+    right: 0;
+    height: 1.5em;
+    background: rgba(255, 235, 170, 0.35);
+    pointer-events: none;
+    border-left: 3px solid #f4c542;
+}
+
+.line-number {
+    cursor: pointer;
+    text-decoration: none;
+    color: var(--text-secondary);
+    padding: 0 0.25rem;
+}
+
+.line-number.selected {
+    background: rgba(255, 235, 170, 0.4);
+    color: var(--text-primary);
+}
+
+/* Line numbers */
+.highlight-with-lines {
+    display: flex;
+}
+
+.line-numbers {
+    background: var(--bg-tertiary);
+    padding: var(--code-pad-y) 0.5rem;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+    line-height: var(--code-line-height);
+    color: var(--text-secondary);
+    user-select: none;
+    text-align: right;
+    border-right: 1px solid var(--border-primary);
+}
+
+.line-numbers .line-number {
+    display: block;
+    line-height: var(--code-line-height);
+}
+
+.highlight-with-lines .highlight {
+    flex: 1;
+}
+
+.highlight .hll {
+    background-color: transparent;
+}
+
+/* don't conflict with our highlight */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem;
+    line-height: var(--code-line-height);
+}
+
+/* Collapsed code styling */
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code.expanded {
+    display: block;
+}
+
+/* NOTE(review): this spot contained unexpanded template markers (an
+   if/else on config.collapse_code and a pygments_css placeholder) emitted as
+   literal text, which is not valid CSS. Both branches of the conditional were
+   present, so only the later rule below ever took effect; it is kept and the
+   dead residue is removed. The pygments_css placeholder was presumably meant
+   to expand to the syntax-highlighting stylesheet - confirm in the generating
+   template, where the real fix belongs. */
+.cell-code {
+    display: block;
+    border-bottom: 1px solid var(--border-primary);
+}
+
+/* Ensure our code metrics override Pygments defaults */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem !important;
+    line-height: var(--code-line-height) !important;
+    font-size: var(--code-font-size) !important;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important;
+    border: none;
+}
+
+.line-numbers {
+    line-height: var(--code-line-height) !important;
+}
+
+.line-numbers .line-number {
+    line-height: var(--code-line-height) !important;
+}
+
+/* Custom CSS from frontmatter */
+/* NOTE(review): unexpanded template placeholders for config.custom_css and a
+   config.code_font_size override were emitted here as literal text. The
+   override assigned the placeholder text itself to --code-font-size, which
+   makes every "font-size: var(--code-font-size)" declaration invalid at
+   computed-value time. The residue is dropped so the variable keeps whatever
+   value is declared elsewhere in this stylesheet (assumed to exist; not
+   visible in this excerpt - confirm). The real fix belongs in the generating
+   template. */
+
+/* Cursor for tools */
+body[data-tool="arrow"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><path d="M2 2l7.586 7.586"/><circle cx="11" cy="11" r="2"/></svg>') 12 12, crosshair;
+}
+
+body[data-tool="pen"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><circle cx="4" cy="20" r="2" fill="%23e53935"/></svg>') 4 20, pointer;
+}
+
+body[data-tool="eraser"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M20 20H7l-7-7 7-7h13v14z"/><path d="M13 13l7-7"/><path d="M13 13L9 9"/></svg>') 12 12, auto;
+}
+
+/* Color picker styles */
+.tools-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin: 0.75rem 0 0.5rem 0;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.color-row {
+    display: grid;
+    grid-template-columns: repeat(6, 1fr);
+    gap: 0.25rem;
+    margin-bottom: 0.5rem;
+}
+
+.color-swatch {
+    width: 18px;
+    height: 18px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    position: relative;
+}
+
+.color-swatch:hover {
+    transform: scale(1.1);
+    border-color: var(--text-secondary);
+}
+
+.color-swatch.selected {
+    border-color: var(--text-primary);
+    box-shadow: 0 0 0 2px var(--text-link);
+}
+
+.color-swatch.selected::after {
+    content: '✓';
+    position: absolute;
+    top: 50%;
+    left: 50%;
+    transform: translate(-50%, -50%);
+    color: white;
+    font-size: 10px;
+    font-weight: bold;
+    text-shadow: 1px 1px 1px black;
+}
+
+.color-input {
+    width: 24px;
+    height: 24px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    background: none;
+    padding: 0;
+    grid-column: span 2;
+    justify-self: center;
+}
+
+.color-input:hover {
+    border-color: var(--text-secondary);
+}
+
+/* Thickness slider styles */
+.thickness-row {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    margin-top: 0.75rem;
+}
+
+.thickness-slider {
+    flex: 1;
+    -webkit-appearance: none;
+    appearance: none;
+    height: 4px;
+    background: var(--border-primary);
+    border-radius: 2px;
+    outline: none;
+    opacity: 0.7;
+    transition: opacity 0.2s;
+}
+
+.thickness-slider:hover {
+    opacity: 1;
+}
+
+.thickness-slider::-webkit-slider-thumb {
+    -webkit-appearance: none;
+    appearance: none;
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+}
+
+.thickness-slider::-moz-range-thumb {
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+    border: none;
+}
+
+.thickness-value {
+    font-size: 0.7rem;
+    color: var(--text-secondary);
+    min-width: 20px;
+    text-align: right;
+}
+
+.highlight {
+    background: none !important;
+}
+
+/* Loading animations */
+.loading-spinner {
+    display: inline-block;
+    width: 16px;
+    height: 16px;
+    border: 2px solid var(--border-primary);
+    border-radius: 50%;
+    border-top-color: var(--text-link);
+    animation: spin 1s linear infinite;
+    margin-right: 8px;
+    vertical-align: middle;
+}
+
+@keyframes spin {
+    to {
+        transform: rotate(360deg);
+    }
+}
+
+.loading-skeleton {
+    display: inline-block;
+    background: var(--bg-tertiary);
+    background: linear-gradient(90deg,
+            var(--bg-tertiary) 25%,
+            var(--bg-secondary) 50%,
+            var(--bg-tertiary) 75%);
+    background-size: 200% 100%;
+    animation: loading-shimmer 2s ease-in-out infinite;
+    border-radius: 2px;
+    height: 1em;
+    width: 80px;
+    vertical-align: middle;
+}
+
+@keyframes loading-shimmer {
+    0% {
+        background-position: -200% 0;
+    }
+
+    100% {
+        background-position: 200% 0;
+    }
+}
+
+/* Loading state for cell output */
+.cell-output:has(.loading-spinner) {
+    opacity: 0.7;
+    background: var(--bg-secondary);
+    /* border-left: 3px solid var(--text-link); */
+}
+    </style>
+    <script>
+// --- Drag utilities ---
+        function clamp(val, min, max) { return Math.max(min, Math.min(max, val)); }
+
+        function restorePosition(el, storageKey) {
+            try {
+                const raw = localStorage.getItem(storageKey);
+                if (!raw) return;
+                const pos = JSON.parse(raw);
+                if (typeof pos.left === 'number' && typeof pos.top === 'number') {
+                    el.style.left = pos.left + 'px';
+                    el.style.top = pos.top + 'px';
+                    el.style.right = 'auto';
+                    el.style.bottom = 'auto';
+                }
+            } catch (_) {}
+        }
+
+        function savePosition(el, storageKey) {
+            try {
+                const left = parseFloat(el.style.left || 'NaN');
+                const top = parseFloat(el.style.top || 'NaN');
+                if (!Number.isNaN(left) && !Number.isNaN(top)) {
+                    localStorage.setItem(storageKey, JSON.stringify({ left, top }));
+                }
+            } catch (_) {}
+        }
+
+
+        function makeDraggable(el, storageKey, handleEl) {
+            let dragging = false;
+            let startX = 0, startY = 0; // cursor
+            let origLeft = 0, origTop = 0; // element
+
+            const onMove = (e) => {
+                if (!dragging) return;
+                const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+                const clientY = e.touches ? e.touches[0].clientY : e.clientY;
+                const dx = clientX - startX;
+                const dy = clientY - startY;
+                const w = el.offsetWidth;
+                const h = el.offsetHeight;
+                const maxX = window.innerWidth - w;
+                const maxY = window.innerHeight - h;
+                const newLeft = clamp(origLeft + dx, 0, maxX);
+                const newTop = clamp(origTop + dy, 0, maxY);
+                el.style.left = newLeft + 'px';
+                el.style.top = newTop + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+            };
+
+            const endDrag = () => {
+                if (!dragging) return;
+                dragging = false;
+                document.removeEventListener('mousemove', onMove);
+                document.removeEventListener('mouseup', endDrag);
+                document.removeEventListener('touchmove', onMove);
+                document.removeEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grab');
+                savePosition(el, storageKey);
+                // ensure no-overlap constraint after a drag
+                try { layoutWidgetsStackedBottomRight(); } catch (_) {}
+            };
+
+            const startDrag = (e) => {
+                // Start from element's current on-screen rect
+                const elRect = el.getBoundingClientRect();
+                el.style.left = elRect.left + 'px';
+                el.style.top = elRect.top + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+
+                dragging = true;
+                startX = e.touches ? e.touches[0].clientX : e.clientX;
+                startY = e.touches ? e.touches[0].clientY : e.clientY;
+                origLeft = elRect.left;
+                origTop = elRect.top;
+
+                document.addEventListener('mousemove', onMove);
+                document.addEventListener('mouseup', endDrag);
+                document.addEventListener('touchmove', onMove, { passive: false });
+                document.addEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grabbing');
+                e.preventDefault();
+            };
+
+            (handleEl || el).addEventListener('mousedown', startDrag);
+            (handleEl || el).addEventListener('touchstart', startDrag, { passive: false });
+
+            // Apply any saved position on init
+            restorePosition(el, storageKey);
+        }
+        function toggleCell(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+            }
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+            }
+            
+            updateIndicators(cellId);
+            encodeToolStateToUrl();
+        }
+        
+        function toggleCode(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+        
+        function toggleOutput(cellId) {
+            const outputElement = document.getElementById('output-' + cellId);
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+
+        function toggleUvLogs(headerElement) {
+            const contentElement = headerElement.nextElementSibling;
+            if (contentElement) {
+                const isCollapsed = contentElement.style.display === 'none';
+                contentElement.style.display = isCollapsed ? 'block' : 'none';
+                headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+
+                // Update the header indicator if it exists
+                const uvLogsDiv = headerElement.parentElement;
+                if (uvLogsDiv && uvLogsDiv.id && uvLogsDiv.id.startsWith('uv-logs-')) {
+                    const cellId = uvLogsDiv.id.replace('uv-logs-', '');
+                    const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+
+        function toggleUvLogsFromHeader(cellId) {
+            const uvLogsElement = document.getElementById('uv-logs-' + cellId);
+            const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+            if (uvLogsElement) {
+                const headerElement = uvLogsElement.querySelector('.uv-logs-header');
+                const contentElement = uvLogsElement.querySelector('.uv-logs-content');
+                if (contentElement && headerElement) {
+                    const isCollapsed = contentElement.style.display === 'none';
+                    contentElement.style.display = isCollapsed ? 'block' : 'none';
+                    headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+        
+        function updateIndicators(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            const indicators = document.querySelector(`[onclick*="${cellId}"]`)?.closest('.cell-header')?.querySelector('.collapse-indicators');
+            
+            if (indicators) {
+                const codeCollapsed = codeElement && codeElement.classList.contains('collapsed');
+                const outputCollapsed = outputElement && outputElement.classList.contains('collapsed');
+                
+                const codeIcon = codeCollapsed ? '▶' : '▼';
+                const outputIcon = outputCollapsed ? '▶' : '▼';
+                
+                const codeSpan = indicators.querySelector('[onclick*="toggleCode"]');
+                const outputSpan = indicators.querySelector('[onclick*="toggleOutput"]');
+                
+                if (codeSpan) codeSpan.innerHTML = `${codeIcon} code`;
+                if (outputSpan) outputSpan.innerHTML = `${outputIcon} output`;
+            }
+        }
+        
+        function toggleTheme() {
+            const html = document.documentElement;
+            const currentTheme = html.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            html.setAttribute('data-theme', newTheme);
+            localStorage.setItem('uvnote-theme', newTheme);
+            updateThemeIcon();
+            updateUiDebug();
+        }
+        
+        // Two panel code removed
+        
+        function updateThemeIcon() {
+            const theme = document.documentElement.getAttribute('data-theme');
+            const toggle = document.querySelector('.theme-toggle');
+            if (toggle) {
+                toggle.textContent = theme === 'dark' ? 'light' : 'dark';
+            }
+        }
+        function setUiTheme(newUi) {
+            if (newUi !== 'default' && newUi !== 'none' && newUi !== 'monocolor') return;
+            const html = document.documentElement;
+            html.setAttribute('data-ui', newUi);
+            try { localStorage.setItem('uvnote-ui', newUi); } catch (_) {}
+            updateUiMenu();
+            updateUiDebug();
+        }
+        function updateUiMenu() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const checks = {
+                default: document.getElementById('checkbox-ui-default'),
+                none: document.getElementById('checkbox-ui-none'),
+                monocolor: document.getElementById('checkbox-ui-monocolor')
+            };
+            if (checks.default) checks.default.textContent = ui === 'default' ? '☑' : '☐';
+            if (checks.none) checks.none.textContent = ui === 'none' ? '☑' : '☐';
+            if (checks.monocolor) checks.monocolor.textContent = ui === 'monocolor' ? '☑' : '☐';
+        }
+
+        function updateUiDebug() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const color = document.documentElement.getAttribute('data-theme') || 'light';
+            const el = document.getElementById('ui-debug');
+            if (el) {
+                el.textContent = `UI: ${ui} | Color: ${color}`;
+            }
+        }
+
+        // Line selection and deep-linking
+        function clearLineSelections() {
+            try {
+                document.querySelectorAll('.code-line-highlight').forEach(el => { el.style.display = 'none'; });
+                document.querySelectorAll('.line-number.selected').forEach(el => el.classList.remove('selected'));
+            } catch (_) {}
+        }
+
+        let _selection = null; // { cellId, a, b }
+
+        function clearSelection(updateUrl) {
+            clearLineSelections();
+            _selection = null;
+            if (updateUrl) {
+                try {
+                    const url = new URL(window.location.href);
+                    url.searchParams.delete('cell');
+                    url.searchParams.delete('line');
+                    history.replaceState(null, '', url.toString());
+                } catch (_) {}
+            }
+            updateStateIndicator();
+        }
+
+        function selectCellLine(cellId, line, updateUrl) {
+            try {
+                // Ensure only one selection across the whole document
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                // Measure line height directly from computed style
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                // Position overlay
+                overlay.style.display = 'block';
+                overlay.style.height = `${lh}px`;
+                overlay.style.top = `${pre.offsetTop + padTop + (line - 1) * lh}px`;
+
+                // Update selected class in line numbers
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    const sel = numbers.querySelector(`.line-number[data-line="${line}"]`);
+                    if (sel) sel.classList.add('selected');
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    url.searchParams.set('line', String(line));
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a: line, b: line };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLine error', e); }
+        }
+
+        function selectCellLines(cellId, startLine, endLine, updateUrl) {
+            try {
+                // normalize order
+                const a = Math.min(startLine, endLine);
+                const b = Math.max(startLine, endLine);
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                overlay.style.display = 'block';
+                overlay.style.top = `${pre.offsetTop + padTop + (a - 1) * lh}px`;
+                overlay.style.height = `${(b - a + 1) * lh}px`;
+
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    for (let i = a; i <= b; i++) {
+                        const el = numbers.querySelector(`.line-number[data-line="${i}"]`);
+                        if (el) el.classList.add('selected');
+                    }
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    if (a === b) url.searchParams.set('line', String(a));
+                    else url.searchParams.set('line', `${a}-${b}`);
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a, b };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLines error', e); }
+        }
+
+        // Drag-to-select support on line numbers
+        let _lineDrag = { active: false, cellId: null, start: 0 };
+        function onLineNumberMouseDown(e) {
+            const a = e.target.closest('.line-number');
+            if (!a) return;
+            e.preventDefault();
+            const cellId = a.dataset.cell;
+            const line = parseInt(a.dataset.line || '1', 10) || 1;
+            // Toggle off if this exact single line is already the only selection
+            const numbers = document.getElementById(`lines-${cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length === 1 && selected[0] === line) {
+                    clearSelection(true);
+                    return;
+                }
+            }
+            _lineDrag.active = true;
+            _lineDrag.cellId = cellId;
+            _lineDrag.start = line;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, _lineDrag.start, false);
+        }
+        function onDocMouseMove(e) {
+            if (!_lineDrag.active) return;
+            const el = document.elementFromPoint(e.clientX, e.clientY);
+            if (!el) return;
+            const a = el.closest && el.closest('.line-number');
+            if (!a) return;
+            if (a.dataset.cell !== _lineDrag.cellId) return;
+            const cur = parseInt(a.dataset.line || '1', 10) || 1;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, cur, false);
+        }
+        function onDocMouseUp(e) {
+            if (!_lineDrag.active) return;
+            const last = document.querySelector('.line-number.selected:last-of-type');
+            // finalize URL using the current selected range
+            const numbers = document.getElementById(`lines-${_lineDrag.cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length) {
+                    const a = Math.min(...selected); const b = Math.max(...selected);
+                    selectCellLines(_lineDrag.cellId, a, b, true);
+                }
+            }
+            _lineDrag.active = false; _lineDrag.cellId = null; _lineDrag.start = 0;
+        }
+
+        function applyLocationFromUrl() {
+            try {
+                const url = new URL(window.location.href);
+                const cell = url.searchParams.get('cell');
+                const lineParam = url.searchParams.get('line');
+                if (cell && lineParam) {
+                    if (lineParam.includes('-')) {
+                        const [a, b] = lineParam.split('-').map(x => parseInt(x, 10));
+                        if (!Number.isNaN(a) && !Number.isNaN(b)) selectCellLines(cell, a, b, false);
+                    } else {
+                        const l = parseInt(lineParam, 10);
+                        if (!Number.isNaN(l)) selectCellLine(cell, l, false);
+                    }
+                }
+                
+                // Apply tool parameters from URL
+                applyToolsFromUrl(url.searchParams);
+                
+                // Cell states will be applied later in DOMContentLoaded with proper timing
+                const encodedCellStates = url.searchParams.get('cells');
+                console.log('Encoded cell states from URL:', encodedCellStates);
+            } catch (_) {}
+        }
+        
+        function applyToolsFromUrl(params) {
+            try {
+                // Check if tools widget should be shown
+                const showTools = params.get('tools');
+                if (showTools === '1') {
+                    // Mark that tool was loaded from URL
+                    _urlLoadedTool = true;
+                    
+                    // Apply color
+                    const color = params.get('color');
+                    if (color && /^[0-9a-fA-F]{6}$/.test(color)) {
+                        setStoredArrowColor('#' + color);
+                    }
+                    
+                    // Apply thickness
+                    const thickness = params.get('thickness');
+                    if (thickness) {
+                        const value = parseInt(thickness, 10);
+                        if (value >= 1 && value <= 10) {
+                            setStoredLineThickness(value);
+                        }
+                    }
+                    
+                    // Don't override fadeout time for URL-loaded tools - let individual shapes decide
+                    
+                    // Load shapes from URL
+                    const encodedShapes = params.get('shapes');
+                    if (encodedShapes) {
+                        const decodedShapes = decodeShapesFromUrl(encodedShapes);
+                        if (decodedShapes.length > 0) {
+                            _shapes = decodedShapes;
+                            saveShapes();
+                            // Trigger render after overlay is initialized
+                            setTimeout(() => {
+                                renderOverlay();
+                            }, 300);
+                        }
+                    }
+                    
+                    // Wait for widgets to be initialized before showing tools
+                    setTimeout(() => {
+                        const toolsWidget = document.querySelector('.tools-widget');
+                        const checkbox = document.getElementById('checkbox-tools');
+                        if (toolsWidget && checkbox) {
+                            toolsWidget.style.display = 'block';
+                            checkbox.textContent = '☑';
+                            localStorage.setItem('uvnote-widget-tools', 'visible');
+                        }
+                        
+                        // Apply active tool
+                        const activeTool = params.get('tool');
+                        if (activeTool && ['arrow', 'pen', 'eraser', 'spotlight'].includes(activeTool)) {
+                            const toolBtn = Array.from(document.querySelectorAll('.tool-button')).find(btn => btn.textContent === activeTool);
+                            if (toolBtn) {
+                                toolBtn.click();
+                            }
+                        }
+                        
+                        // Re-layout widgets after showing tools
+                        layoutWidgetsStackedBottomRight();
+                    }, 200);
+                }
+            } catch (_) {}
+        }
+        
+        function captureInitialCellStates() {
+            const cells = document.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    if (codeEl) {
+                        state.c = codeEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    if (outputEl) {
+                        state.o = outputEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    _initialCellStates[cellId] = state;
+                }
+            });
+            console.log('Captured initial cell states:', _initialCellStates);
+        }
+
+        function encodeCellStatesToUrl() {
+            // Get all cells and their collapse states
+            const cells = document.querySelectorAll('.cell');
+            const cellStates = {};
+            
+            console.log('Found cells:', cells.length);
+            
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                const initialState = _initialCellStates[cellId] || {};
+                
+                console.log(`Encoding cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    codeCollapsed: codeEl ? codeEl.classList.contains('collapsed') : 'N/A',
+                    outputCollapsed: outputEl ? outputEl.classList.contains('collapsed') : 'N/A',
+                    initialState: initialState
+                });
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    let hasChanges = false;
+                    
+                    if (codeEl) {
+                        const currentCodeState = codeEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialCodeState = initialState.c;
+                        // Only encode if different from initial state
+                        if (initialCodeState !== undefined && currentCodeState !== initialCodeState) {
+                            state.c = currentCodeState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    if (outputEl) {
+                        const currentOutputState = outputEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialOutputState = initialState.o;
+                        // Only encode if different from initial state
+                        if (initialOutputState !== undefined && currentOutputState !== initialOutputState) {
+                            state.o = currentOutputState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    // Only include cell if it has changes from initial state
+                    if (hasChanges) {
+                        cellStates[cellId] = state;
+                        console.log(`Added cell ${cellId}:`, state);
+                    }
+                }
+            });
+            
+            console.log('Final cell states to encode:', cellStates);
+            
+            // Return empty string if no changed cells
+            if (Object.keys(cellStates).length === 0) return '';
+            
+            // Encode as compact base64 string
+            const encoded = btoa(JSON.stringify(cellStates));
+            console.log('Encoded cell states:', encoded);
+            return encoded;
+        }
+        
+        function decodeCellStatesFromUrl(encodedStates) {
+            if (!encodedStates) return {};
+            
+            try {
+                return JSON.parse(atob(encodedStates));
+            } catch (e) {
+                console.error('Failed to decode cell states:', e);
+                return {};
+            }
+        }
+        
+        function applyCellStatesFromUrl(cellStates) {
+            console.log('Applying cell states from URL:', cellStates);
+            Object.entries(cellStates).forEach(([cellId, state]) => {
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                console.log(`Cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    state: state
+                });
+                
+                if (codeEl && state.c !== undefined) {
+                    if (state.c === 0) {
+                        codeEl.classList.add('collapsed');
+                        console.log(`Collapsed code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    } else {
+                        codeEl.classList.remove('collapsed');
+                        codeEl.classList.add('expanded'); // Explicitly add expanded class
+                        console.log(`Expanded code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            hasExpandedClass: codeEl.classList.contains('expanded'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    }
+                }
+                
+                if (outputEl && state.o !== undefined) {
+                    if (state.o === 0) {
+                        outputEl.classList.add('collapsed');
+                        console.log(`Collapsed output for cell ${cellId}`);
+                    } else {
+                        outputEl.classList.remove('collapsed');
+                        console.log(`Expanded output for cell ${cellId}`);
+                    }
+                }
+                
+                // Update visual indicators and force style recalculation
+                try {
+                    updateIndicators(cellId);
+                    // Force browser to recalculate styles
+                    if (codeEl) {
+                        codeEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - code visible: ${getComputedStyle(codeEl).display !== 'none'}`);
+                    }
+                    if (outputEl) {
+                        outputEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - output visible: ${getComputedStyle(outputEl).display !== 'none'}`);
+                    }
+                } catch (e) {
+                    console.error(`Error updating indicators for cell ${cellId}:`, e);
+                }
+            });
+        }
+
+        function encodeShapesToUrl() {
+            // Encode shapes as compact base64 string
+            if (_shapes.length === 0) return '';
+            
+            const shapeData = _shapes.map(shape => {
+                const baseData = {
+                    ct: shape.createdAt, // creation timestamp
+                    fo: shape.fadeoutTime || getFadeoutTime() // fadeout time for this shape
+                };
+                
+                if (shape.type === 'arrow') {
+                    return {
+                        ...baseData,
+                        t: 'a',
+                        x1: Math.round(shape.x1),
+                        y1: Math.round(shape.y1),
+                        x2: Math.round(shape.x2),
+                        y2: Math.round(shape.y2),
+                        c: shape.color.substring(1), // remove #
+                        w: shape.width
+                    };
+                } else if (shape.type === 'pen') {
+                    return {
+                        ...baseData,
+                        t: 'p',
+                        pts: shape.points.map(p => [Math.round(p.x), Math.round(p.y)]),
+                        c: shape.color.substring(1),
+                        w: shape.width
+                    };
+                } else if (shape.type === 'spotlight') {
+                    return {
+                        ...baseData,
+                        t: 's',
+                        x: Math.round(shape.x),
+                        y: Math.round(shape.y),
+                        r: Math.round(shape.radius)
+                    };
+                }
+            }).filter(Boolean);
+            
+            return btoa(JSON.stringify(shapeData));
+        }
+        
+        function decodeShapesFromUrl(encodedShapes) {
+            if (!encodedShapes) return [];
+            
+            try {
+                const shapeData = JSON.parse(atob(encodedShapes));
+                return shapeData.map(data => {
+                    const base = {
+                        createdAt: data.ct || Date.now(), // use encoded timestamp or current time
+                        fadeoutTime: data.fo || 0, // use encoded fadeout time or 0 (never fade)
+                        opacity: 1.0
+                    };
+                    
+                    if (data.t === 'a') {
+                        return {
+                            ...base,
+                            type: 'arrow',
+                            x1: data.x1,
+                            y1: data.y1,
+                            x2: data.x2,
+                            y2: data.y2,
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 'p') {
+                        return {
+                            ...base,
+                            type: 'pen',
+                            points: data.pts.map(([x, y]) => ({ x, y })),
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 's') {
+                        return {
+                            ...base,
+                            type: 'spotlight',
+                            x: data.x,
+                            y: data.y,
+                            radius: data.r,
+                            color: '#000000'
+                        };
+                    }
+                }).filter(Boolean);
+            } catch (e) {
+                console.error('Failed to decode shapes:', e);
+                return [];
+            }
+        }
+        
+        function encodeToolStateToUrl() {
+            // Don't update URL during initialization
+            if (_isInitializing) {
+                return window.location.href;
+            }
+            
+            const params = new URLSearchParams(window.location.search);
+            
+            // Check if tools widget is visible and has an active tool
+            const toolsWidget = document.querySelector('.tools-widget');
+            const activeTool = document.body.dataset.tool;
+            const hasActiveTool = activeTool && activeTool !== 'none';
+            const toolsWidgetVisible = toolsWidget && getComputedStyle(toolsWidget).display !== 'none';
+            
+            // Always handle shapes regardless of tool state
+            const encodedShapes = encodeShapesToUrl();
+            if (encodedShapes) {
+                params.set('shapes', encodedShapes);
+            } else {
+                params.delete('shapes');
+            }
+            
+            // Always preserve existing cell states from URL if present
+            const existingCellStates = params.get('cells');
+            if (existingCellStates) {
+                // Keep existing cell states - don't re-encode from DOM
+                params.set('cells', existingCellStates);
+            } else {
+                // Only encode new cell states if none exist in URL
+                const encodedCellStates = encodeCellStatesToUrl();
+                if (encodedCellStates) {
+                    params.set('cells', encodedCellStates);
+                }
+            }
+            
+            if (toolsWidgetVisible && hasActiveTool) {
+                // Include tool params when widget is visible AND tool is active
+                params.set('tools', '1');
+                params.set('tool', activeTool);
+                
+                // Get color (without # prefix)
+                const color = getArrowColor();
+                if (color && color.startsWith('#')) {
+                    params.set('color', color.substring(1));
+                }
+                
+                // Get thickness
+                const thickness = getLineThickness();
+                params.set('thickness', thickness.toString());
+            } else {
+                // Remove tool state params but keep shapes
+                params.delete('tools');
+                params.delete('tool');
+                params.delete('color');
+                params.delete('thickness');
+                params.delete('fadeout');
+            }
+            
+            // Update URL without reloading
+            const newUrl = window.location.pathname + (params.toString() ? '?' + params.toString() : '') + window.location.hash;
+            window.history.replaceState(null, '', newUrl);
+            
+            return window.location.href;
+        }
+
+        function resetLayout() {
+            try {
+                // Clear all uvnote-* keys
+                const allKeys = Object.keys(localStorage);
+                const uvnoteKeys = allKeys.filter(key => key.startsWith('uvnote-'));
+                uvnoteKeys.forEach(k => localStorage.removeItem(k));
+            } catch (_) {}
+            
+            // Clear any active selection and remove URL params
+            try { clearSelection(true); } catch(_) {}
+            // Reset active tool if any
+            try { window.setActiveTool('none'); } catch(_) {}
+            // Clear shapes
+            try { _shapes = []; saveShapes(); } catch(_) {}
+            // Reset URL-loaded tool flag
+            try { _urlLoadedTool = false; } catch(_) {}
+            // Reset all cells to expanded state
+            try {
+                const cells = document.querySelectorAll('.cell');
+                cells.forEach(cell => {
+                    const cellId = cell.id.replace('cell-', '');
+                    const codeEl = document.getElementById('code-' + cellId);
+                    const outputEl = document.getElementById('output-' + cellId);
+                    if (codeEl) codeEl.classList.remove('collapsed');
+                    if (outputEl) outputEl.classList.remove('collapsed');
+                    updateIndicators(cellId);
+                });
+            } catch(_) {}
+            
+            // Clear ALL URL parameters and reload with clean URL
+            try {
+                const cleanUrl = window.location.pathname + window.location.hash;
+                window.location.href = cleanUrl; // Use window.location.href instead of history.replaceState + reload
+            } catch (_) {
+                // Fallback - reload current page
+                location.reload();
+            }
+        }
+
+        function toggleMenu() {
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.toggle('active');
+            }
+        }
+
+        function toggleWidget(widgetName) {
+            let widget;
+            let checkbox;
+            
+            // Close the menu first
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.remove('active');
+            }
+            
+            switch(widgetName) {
+                case 'tools':
+                    widget = document.querySelector('.tools-widget');
+                    checkbox = document.getElementById('checkbox-tools');
+                    break;
+                case 'file-explorer':
+                    widget = document.querySelector('.file-explorer');
+                    checkbox = document.getElementById('checkbox-file-explorer');
+                    break;
+                case 'minimap':
+                    widget = document.querySelector('.minimap');
+                    checkbox = document.getElementById('checkbox-minimap');
+                    break;
+                case 'status':
+                    widget = document.querySelector('.status-widget');
+                    checkbox = document.getElementById('checkbox-status');
+                    break;
+                default:
+                    return;
+            }
+            
+            if (widget && checkbox) {
+                const isVisible = getComputedStyle(widget).display !== 'none';
+                widget.style.display = isVisible ? 'none' : 'block';
+                checkbox.textContent = isVisible ? '☐' : '☑';
+                
+                // Save state to localStorage
+                try {
+                    localStorage.setItem(`uvnote-widget-${widgetName}`, isVisible ? 'hidden' : 'visible');
+                } catch (_) {}
+                
+                // Re-layout widgets after visibility change
+                try { 
+                    layoutWidgetsStackedBottomRight(); 
+                } catch (_) {}
+                
+                // Update URL when tools widget visibility changes
+                if (widgetName === 'tools') {
+                    encodeToolStateToUrl();
+                }
+            }
+        }
+        
+        function initializeWidgetVisibility() {
+            const widgets = [
+                { name: 'tools', selector: '.tools-widget' },
+                { name: 'file-explorer', selector: '.file-explorer' },
+                { name: 'minimap', selector: '.minimap' },
+                { name: 'status', selector: '.status-widget' }
+            ];
+            
+            widgets.forEach(({ name, selector }) => {
+                const defaultState = name === 'status' ? 'visible' : 'hidden';
+                const savedState = localStorage.getItem(`uvnote-widget-${name}`) || defaultState;
+                const widget = document.querySelector(selector);
+                const checkbox = document.getElementById(`checkbox-${name}`);
+                
+                if (widget && checkbox) {
+                    const isVisible = savedState === 'visible';
+                    widget.style.display = isVisible ? 'block' : 'none';
+                    checkbox.textContent = isVisible ? '☑' : '☐';
+                }
+            });
+        }
+
+        // Close menu when clicking outside
+        document.addEventListener('click', function(event) {
+            const menuButton = document.querySelector('.menu-button');
+            // Don't close if clicking on a menu item (let the item handler close it)
+            if (menuButton && !menuButton.contains(event.target)) {
+                menuButton.classList.remove('active');
+            }
+        });
+
+        // Layout: stack widgets bottom-right and equalize widths
+        function hasCustomWidgetPositions() {
+            try {
+                return (
+                    localStorage.getItem('uvnote-minimap-pos') ||
+                    localStorage.getItem('uvnote-file-explorer-pos') ||
+                    localStorage.getItem('uvnote-tools-pos')
+                );
+            } catch (_) { return false; }
+        }
+
+        function rectsOverlap(r1, r2) {
+            return !(r1.right <= r2.left || r2.right <= r1.left || r1.bottom <= r2.top || r2.bottom <= r1.top);
+        }
+
+        function widgetsOverlap(widgets) {
+            for (let i = 0; i < widgets.length; i++) {
+                const a = widgets[i];
+                const ra = a.getBoundingClientRect();
+                for (let j = i + 1; j < widgets.length; j++) {
+                    const b = widgets[j];
+                    const rb = b.getBoundingClientRect();
+                    if (rectsOverlap(ra, rb)) return true;
+                }
+            }
+            return false;
+        }
+
+        function applyStackLayout(widgets, order) {
+            if (!widgets.length) return;
+            // Fixed equal width
+            const fixedWidth = 220;
+            widgets.forEach(el => { el.style.width = fixedWidth + 'px'; });
+
+            // Fit heights if needed to avoid overflow
+            const gap = 12;
+            const available = Math.max(0, window.innerHeight - 40 - gap * (order.length - 1));
+            const eachMax = Math.floor(available / order.length);
+            order.forEach(el => {
+                el.style.maxHeight = eachMax + 'px';
+                el.style.overflowY = 'auto';
+            });
+
+            // Stack bottom-up in the requested order
+            let bottomOffset = 20; // base gutter
+            order.forEach(el => {
+                el.style.left = 'auto';
+                el.style.top = 'auto';
+                el.style.right = '20px';
+                el.style.bottom = bottomOffset + 'px';
+                bottomOffset += el.offsetHeight + gap;
+            });
+        }
+
+        function layoutWidgetsStackedBottomRight() {
+            const minimap = document.querySelector('.minimap');
+            const fileExplorer = document.querySelector('.file-explorer');
+            const tools = document.querySelector('.tools-widget');
+            const status = document.querySelector('.status-widget');
+            const widgets = [minimap, fileExplorer, tools, status].filter(el => el && getComputedStyle(el).display !== 'none');
+            if (!widgets.length) return;
+
+            const order = [minimap, fileExplorer, tools, status].filter(Boolean).filter(el => getComputedStyle(el).display !== 'none');
+
+            // If user placed custom positions and there is no overlap, respect them.
+            if (hasCustomWidgetPositions() && !widgetsOverlap(widgets)) return;
+
+            applyStackLayout(widgets, order);
+        }
+        
+        // Panel icon removed
+        
+        let _minimapScrollContainer = null;
+        let _minimapScrollHandler = null;
+        function initMinimap() {
+            // Generate minimap content
+            const minimap = createMinimap();
+            document.body.appendChild(minimap);
+            // Make draggable (use title as handle)
+            const mTitle = minimap.querySelector('.minimap-title');
+            makeDraggable(minimap, 'uvnote-minimap-pos', mTitle);
+
+            // Attach scroll listener to window (two-panel removed)
+            _minimapScrollContainer = window;
+
+            if (_minimapScrollContainer) {
+                _minimapScrollHandler = () => updateMinimapActive();
+                if (_minimapScrollContainer === window) {
+                    window.addEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.addEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            updateMinimapActive();
+        }
+
+        function teardownMinimap() {
+            const minimap = document.querySelector('.minimap');
+            if (minimap && minimap.parentNode) minimap.parentNode.removeChild(minimap);
+            if (_minimapScrollContainer && _minimapScrollHandler) {
+                if (_minimapScrollContainer === window) {
+                    window.removeEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.removeEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            _minimapScrollContainer = null;
+            _minimapScrollHandler = null;
+        }
+        
+        function initFileExplorer() {
+            // Generate file explorer content
+            const fileExplorer = createFileExplorer();
+            document.body.appendChild(fileExplorer);
+        }
+        
+        function createMinimap() {
+            const minimap = document.createElement('div');
+            minimap.className = 'minimap';
+            
+            const title = document.createElement('div');
+            title.className = 'minimap-title';
+            title.textContent = 'navigation';
+            minimap.appendChild(title);
+            
+            // Find all headings and cells
+            const root = document.querySelector('.main-content') || document;
+            const headings = root.querySelectorAll('h1, h2, h3, h4, h5, h6');
+            const cells = root.querySelectorAll('.cell');
+            
+            // Combine and sort by position
+            const items = [];
+            
+            headings.forEach(heading => {
+                const id = heading.id || generateId(heading.textContent);
+                if (!heading.id) heading.id = id;
+                
+                items.push({
+                    element: heading,
+                    type: 'heading',
+                    level: parseInt(heading.tagName.charAt(1)),
+                    text: heading.textContent.trim(),
+                    id: id,
+                    position: heading.getBoundingClientRect().top + window.scrollY
+                });
+            });
+            
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const id = cell.id || `cell-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+                    if (!cell.id) cell.id = id;
+                    
+                    items.push({
+                        element: cell,
+                        type: 'cell',
+                        text: header.textContent.trim(),
+                        id: id,
+                        position: cell.getBoundingClientRect().top + window.scrollY
+                    });
+                }
+            });
+            
+            // Sort by position
+            items.sort((a, b) => a.position - b.position);
+            
+            // Create minimap items
+            items.forEach(item => {
+                const link = document.createElement('a');
+                link.className = `minimap-item ${item.type === 'heading' ? 'minimap-heading' : 'minimap-cell'}`;
+                if (item.type === 'heading') {
+                    link.classList.add(`h${item.level}`);
+                }
+                link.textContent = item.text.length > 25 ? item.text.substring(0, 22) + '...' : item.text;
+                link.href = `#${item.id}`;
+                link.onclick = function(e) {
+                    e.preventDefault();
+                    item.element.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                };
+                minimap.appendChild(link);
+            });
+            
+            return minimap;
+        }
+        
+        function generateId(text) {
+            return text.toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '')
+                .substring(0, 20);
+        }
+        
+        function updateMinimapActive() {
+            const minimapItems = document.querySelectorAll('.minimap-item');
+            const container = _minimapScrollContainer || window;
+            const containerRect = container === window ? null : container.getBoundingClientRect();
+            const scrollPos = (container === window ? window.scrollY : container.scrollTop) + 100; // Offset for better detection
+            
+            let activeItem = null;
+            minimapItems.forEach(item => {
+                const targetId = item.getAttribute('href').substring(1);
+                const target = document.getElementById(targetId);
+                
+                if (target) {
+                    const rectTop = target.getBoundingClientRect().top;
+                    const targetPos = (container === window)
+                        ? rectTop + window.scrollY
+                        : rectTop - containerRect.top + container.scrollTop;
+                    if (targetPos <= scrollPos) {
+                        activeItem = item;
+                    }
+                }
+                
+                item.classList.remove('active');
+            });
+            
+            if (activeItem) {
+                activeItem.classList.add('active');
+            }
+        }
+        
+        function createFileExplorer() {
+            const fileExplorer = document.createElement('div');
+            fileExplorer.className = 'file-explorer';
+            
+            const title = document.createElement('div');
+            title.className = 'file-explorer-title';
+            title.textContent = 'files';
+            fileExplorer.appendChild(title);
+            // Make draggable (use title as handle)
+            makeDraggable(fileExplorer, 'uvnote-file-explorer-pos', title);
+            
+            // Scripts section
+            const scriptsSection = document.createElement('div');
+            scriptsSection.className = 'file-explorer-section';
+            
+            const scriptsTitle = document.createElement('div');
+            scriptsTitle.className = 'file-explorer-section-title';
+            scriptsTitle.textContent = 'scripts';
+            scriptsSection.appendChild(scriptsTitle);
+            
+            // Find all cells and list their script files (single panel)
+            const root = document.querySelector('.main-content') || document;
+            const cells = root.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const cellText = header.textContent.trim();
+                    const cellMatch = cellText.match(/Cell: ([a-zA-Z_][a-zA-Z0-9_]*)/);
+                    if (cellMatch) {
+                        const cellId = cellMatch[1];
+                        const scriptItem = document.createElement('div');
+                        scriptItem.className = 'file-explorer-item script';
+                        scriptItem.textContent = `${cellId}.py`;
+                        scriptItem.onclick = function() {
+                            cell.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        };
+                        scriptsSection.appendChild(scriptItem);
+                    }
+                }
+            });
+            
+            fileExplorer.appendChild(scriptsSection);
+            
+            // Artifacts section
+            const artifactsSection = document.createElement('div');
+            artifactsSection.className = 'file-explorer-section';
+            
+            const artifactsTitle = document.createElement('div');
+            artifactsTitle.className = 'file-explorer-section-title';
+            artifactsTitle.textContent = 'artifacts';
+            artifactsSection.appendChild(artifactsTitle);
+            
+            // Find all artifact links (single panel)
+            const artifactsRoot = document.querySelector('.main-content') || document;
+            const artifacts = artifactsRoot.querySelectorAll('.artifact');
+            if (artifacts.length === 0) {
+                const noArtifacts = document.createElement('div');
+                noArtifacts.className = 'file-explorer-item artifact';
+                noArtifacts.textContent = '(none)';
+                noArtifacts.style.opacity = '0.5';
+                artifactsSection.appendChild(noArtifacts);
+            } else {
+                artifacts.forEach(artifact => {
+                    const artifactItem = document.createElement('div');
+                    artifactItem.className = 'file-explorer-item artifact';
+                    artifactItem.textContent = artifact.textContent;
+                    artifactItem.onclick = function() {
+                        artifact.click();
+                    };
+                    artifactsSection.appendChild(artifactItem);
+                });
+            }
+            
+            fileExplorer.appendChild(artifactsSection);
+            
+            return fileExplorer;
+        }
+
+        function initStatusWidget() {
+            let el = document.querySelector('.status-widget');
+            if (!el) {
+                el = document.createElement('div');
+                el.className = 'status-widget';
+                el.id = 'status-widget';
+                el.textContent = 'ready — Esc';
+                document.body.appendChild(el);
+            }
+        }
+
+        // Tools widget
+        let _cursorX = 0;
+        let _cursorY = 0;
+        let _cursorVisible = false;
+        
+        function setActiveTool(tool) {
+            if (!tool || tool === 'none') {
+                document.body.dataset.tool = 'none';
+                localStorage.setItem('uvnote-active-tool', 'none');
+                setOverlayActive(false);
+                _cursorVisible = false;
+                // Remove active class from all tool buttons when deactivating
+                const toolButtons = document.querySelectorAll('.tools-widget .tool-button');
+                toolButtons.forEach(btn => btn.classList.remove('active'));
+                updateStateIndicator();
+                encodeToolStateToUrl();
+                return;
+            }
+            document.body.dataset.tool = tool;
+            localStorage.setItem('uvnote-active-tool', tool);
+            setOverlayActive(true);
+            _cursorVisible = true;
+            updateStateIndicator();
+            encodeToolStateToUrl();
+        }
+
+        // Make setActiveTool globally accessible for ESC key handler
+        window.setActiveTool = setActiveTool;
+        
+
+        function getArrowColor() {
+            const saved = localStorage.getItem('uvnote-arrow-color');
+            if (saved) return saved;
+            return '#e53935'; // Default red color
+        }
+
+        function setStoredArrowColor(color) {
+            try { localStorage.setItem('uvnote-arrow-color', color); } catch (_) {}
+        }
+
+        function getLineThickness() {
+            const saved = localStorage.getItem('uvnote-line-thickness');
+            if (saved) return parseInt(saved, 10);
+            return 6; // default thickness
+        }
+
+        function setStoredLineThickness(thickness) {
+            try { localStorage.setItem('uvnote-line-thickness', thickness); } catch (_) {}
+        }
+
+        function getFadeoutTime() {
+            const saved = localStorage.getItem('uvnote-fadeout-time');
+            if (saved) return parseInt(saved, 10);
+            return 5; // default 5 seconds
+        }
+
+        function setStoredFadeoutTime(seconds) {
+            try { localStorage.setItem('uvnote-fadeout-time', seconds); } catch (_) {}
+        }
+
+        function createToolsWidget() {
+            const tools = document.createElement('div');
+            tools.className = 'tools-widget';
+
+            const title = document.createElement('div');
+            title.className = 'tools-title';
+            title.textContent = 'tools';
+            tools.appendChild(title);
+
+            const row = document.createElement('div');
+            row.className = 'tools-row';
+            tools.appendChild(row);
+
+            // Arrow tool
+            const arrowBtn = document.createElement('div');
+            arrowBtn.className = 'tool-button';
+            arrowBtn.textContent = 'arrow';
+            arrowBtn.onclick = function() {
+                const isActive = arrowBtn.classList.contains('active');
+                if (isActive) {
+                    arrowBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    arrowBtn.classList.add('active');
+                    setActiveTool('arrow');
+                }
+            };
+            row.appendChild(arrowBtn);
+
+            // Pen tool
+            const penBtn = document.createElement('div');
+            penBtn.className = 'tool-button';
+            penBtn.textContent = 'pen';
+            penBtn.onclick = function() {
+                const isActive = penBtn.classList.contains('active');
+                if (isActive) {
+                    penBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    penBtn.classList.add('active');
+                    setActiveTool('pen');
+                }
+            };
+            row.appendChild(penBtn);
+
+            // Eraser tool
+            const eraseBtn = document.createElement('div');
+            eraseBtn.className = 'tool-button';
+            eraseBtn.textContent = 'eraser';
+            eraseBtn.onclick = function() {
+                const isActive = eraseBtn.classList.contains('active');
+                if (isActive) {
+                    eraseBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    eraseBtn.classList.add('active');
+                    setActiveTool('eraser');
+                }
+            };
+            row.appendChild(eraseBtn);
+
+            // Spotlight tool
+            const spotlightBtn = document.createElement('div');
+            spotlightBtn.className = 'tool-button';
+            spotlightBtn.textContent = 'spotlight';
+            spotlightBtn.onclick = function() {
+                const isActive = spotlightBtn.classList.contains('active');
+                if (isActive) {
+                    spotlightBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    spotlightBtn.classList.add('active');
+                    setActiveTool('spotlight');
+                }
+            };
+            row.appendChild(spotlightBtn);
+
+            // Clear all
+            const clearBtn = document.createElement('div');
+            clearBtn.className = 'tool-button';
+            clearBtn.textContent = 'clear';
+            clearBtn.onclick = function() {
+                _shapes = [];
+                saveShapes();
+                renderOverlay();
+            };
+            row.appendChild(clearBtn);
+            
+            // We'll add the copy button at the end of the widget
+
+            // Restore active state from storage
+            const saved = localStorage.getItem('uvnote-active-tool') || 'none';
+            if (saved === 'arrow') {
+                arrowBtn.classList.add('active');
+                setActiveTool('arrow');
+            } else if (saved === 'pen') {
+                penBtn.classList.add('active');
+                setActiveTool('pen');
+            } else if (saved === 'eraser') {
+                eraseBtn.classList.add('active');
+                setActiveTool('eraser');
+            } else if (saved === 'spotlight') {
+                spotlightBtn.classList.add('active');
+                setActiveTool('spotlight');
+            }
+
+            // Color selector
+            const colorTitle = document.createElement('div');
+            colorTitle.className = 'tools-section-title';
+            colorTitle.textContent = 'color';
+            tools.appendChild(colorTitle);
+
+            const colorRow = document.createElement('div');
+            colorRow.className = 'tools-row color-row';
+            tools.appendChild(colorRow);
+
+            const swatchColors = [
+                // Primary colors
+                '#e53935', '#fb8c00', '#fdd835', '#43a047', '#1e88e5', '#8e24aa',
+                // Additional useful colors  
+                '#ff5722', '#795548', '#607d8b', '#9c27b0',
+                // Grayscale
+                '#000000', '#424242', '#9e9e9e', '#ffffff'
+            ];
+            const swatches = [];
+            swatchColors.forEach(c => {
+                const s = document.createElement('div');
+                s.className = 'color-swatch';
+                s.style.backgroundColor = c;
+                s.title = c;
+                s.onclick = () => {
+                    setStoredArrowColor(c);
+                    refreshColorUI(c);
+                    if (_cursorVisible) renderOverlay();
+                    encodeToolStateToUrl();
+                };
+                colorRow.appendChild(s);
+                swatches.push(s);
+            });
+
+            const colorInput = document.createElement('input');
+            colorInput.type = 'color';
+            colorInput.className = 'color-input';
+            colorInput.oninput = () => {
+                setStoredArrowColor(colorInput.value);
+                refreshColorUI(colorInput.value);
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+            colorRow.appendChild(colorInput);
+
+            function refreshColorUI(selected) {
+                const selectedHex = selected.startsWith('#') ? selected.toLowerCase() : rgbToHex(selected);
+                
+                swatches.forEach((s, i) => {
+                    const swatchHex = swatchColors[i].toLowerCase();
+                    if (swatchHex === selectedHex) {
+                        s.classList.add('selected');
+                    } else {
+                        s.classList.remove('selected');
+                    }
+                });
+                
+                try { 
+                    colorInput.value = selectedHex; 
+                } catch (_) {}
+            }
+
+            function rgbToHex(rgb) {
+                const m = rgb.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)\)/i);
+                if (!m) return '#000000';
+                const r = parseInt(m[1]).toString(16).padStart(2, '0');
+                const g = parseInt(m[2]).toString(16).padStart(2, '0');
+                const b = parseInt(m[3]).toString(16).padStart(2, '0');
+                return `#${r}${g}${b}`;
+            }
+
+            // Restore color selection
+            refreshColorUI(getArrowColor());
+
+            // Thickness slider
+            const thicknessTitle = document.createElement('div');
+            thicknessTitle.className = 'tools-section-title';
+            thicknessTitle.textContent = 'thickness';
+            tools.appendChild(thicknessTitle);
+
+            const thicknessRow = document.createElement('div');
+            thicknessRow.className = 'thickness-row';
+            tools.appendChild(thicknessRow);
+
+            const thicknessSlider = document.createElement('input');
+            thicknessSlider.type = 'range';
+            thicknessSlider.className = 'thickness-slider';
+            thicknessSlider.min = '1';
+            thicknessSlider.max = '10';
+            thicknessSlider.value = getLineThickness();
+            
+            const thicknessValue = document.createElement('span');
+            thicknessValue.className = 'thickness-value';
+            thicknessValue.textContent = thicknessSlider.value + 'px';
+
+            thicknessSlider.oninput = function() {
+                const value = parseInt(thicknessSlider.value, 10);
+                setStoredLineThickness(value);
+                thicknessValue.textContent = value + 'px';
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+
+            thicknessRow.appendChild(thicknessSlider);
+            thicknessRow.appendChild(thicknessValue);
+
+            // Fadeout time slider
+            const fadeoutTitle = document.createElement('div');
+            fadeoutTitle.className = 'tools-section-title';
+            fadeoutTitle.textContent = 'fadeout time';
+            tools.appendChild(fadeoutTitle);
+
+            const fadeoutRow = document.createElement('div');
+            fadeoutRow.className = 'thickness-row';
+            tools.appendChild(fadeoutRow);
+
+            const fadeoutSlider = document.createElement('input');
+            fadeoutSlider.type = 'range';
+            fadeoutSlider.className = 'thickness-slider';
+            fadeoutSlider.min = '0';
+            fadeoutSlider.max = '30';
+            fadeoutSlider.value = getFadeoutTime();
+            
+            const fadeoutValue = document.createElement('span');
+            fadeoutValue.className = 'thickness-value';
+            fadeoutValue.textContent = fadeoutSlider.value === '0' ? 'never' : fadeoutSlider.value + 's';
+
+            fadeoutSlider.oninput = function() {
+                const value = parseInt(fadeoutSlider.value, 10);
+                setStoredFadeoutTime(value);
+                fadeoutValue.textContent = value === 0 ? 'never' : value + 's';
+                encodeToolStateToUrl();
+            };
+
+            fadeoutRow.appendChild(fadeoutSlider);
+            fadeoutRow.appendChild(fadeoutValue);
+
+            // Draggable behavior
+            makeDraggable(tools, 'uvnote-tools-pos', title);
+
+            return tools;
+        }
+
+        function initTools() {
+            const widget = createToolsWidget();
+            document.body.appendChild(widget);
+        }
+
+        function teardownTools() {
+            const w = document.querySelector('.tools-widget');
+            if (w && w.parentNode) w.parentNode.removeChild(w);
+        }
+
+        // --- Canvas overlay for tools ---
+        let _overlay = null; // overlay element; sized and positioned by updateOverlayBounds
+        let _overlayCtx = null; // presumably the overlay's drawing context (TODO confirm where it is assigned)
+        let _overlayContainer = null; // scroll container the overlay follows; set to window by updateOverlayModeAndContainer
+        let _overlayMode = 'single'; // only 'single' is ever assigned in the visible code
+        let _overlayResizeHandler = null;
+        let _overlayScrollHandler = null;
+        let _drawing = null; // current in-progress shape (e.g. arrow {x1,y1,x2,y2}, or a pen stroke with type 'pen')
+        let _shapes = []; // committed shapes for current mode; persisted by saveShapes
+        let _fadeTimer = null; // timer for fade animation
+        let _urlLoadedTool = false; // track if tool was loaded from URL
+        let _isInitializing = true; // prevent URL updates during initialization
+        let _initialCellStates = {}; // track initial cell states from page load
+
+        function getOverlayStorageKey() { return 'uvnote-shapes'; }
+
+        function loadShapes() {
+            try {
+                const raw = localStorage.getItem(getOverlayStorageKey());
+                _shapes = raw ? JSON.parse(raw) : [];
+            } catch (_) { _shapes = []; }
+        }
+
+        function saveShapes() {
+            try { 
+                localStorage.setItem(getOverlayStorageKey(), JSON.stringify(_shapes));
+                // Always update URL when shapes change
+                encodeToolStateToUrl();
+            } catch (_) {}
+        }
+
+        function updateShapesFade() {
+            const now = Date.now();
+            let needsUpdate = false;
+
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const shape = _shapes[i];
+                if (!shape.createdAt) continue; // Skip old shapes without timestamps
+                
+                // Use individual shape's fadeout time, or global if not set
+                const shapesFadeoutSeconds = shape.fadeoutTime !== undefined ? shape.fadeoutTime : getFadeoutTime();
+                
+                // Skip fading if fadeout is disabled for this shape
+                if (shapesFadeoutSeconds === 0) continue;
+                
+                const fadeStartTime = Math.max(0, (shapesFadeoutSeconds - 2) * 1000); // Start fading 2s before end
+                const fadeEndTime = shapesFadeoutSeconds * 1000; // Fully gone after specified time
+                const age = now - shape.createdAt;
+                
+                if (age >= fadeEndTime) {
+                    // Remove completely faded shapes
+                    _shapes.splice(i, 1);
+                    needsUpdate = true;
+                } else if (age >= fadeStartTime) {
+                    // Update opacity for fading shapes
+                    const fadeProgress = (age - fadeStartTime) / (fadeEndTime - fadeStartTime);
+                    const newOpacity = 1 - fadeProgress;
+                    if (Math.abs(shape.opacity - newOpacity) > 0.01) {
+                        shape.opacity = newOpacity;
+                        needsUpdate = true;
+                    }
+                }
+            }
+
+            if (needsUpdate) {
+                saveShapes();
+                renderOverlay();
+                // Update URL to remove faded shapes  
+                encodeToolStateToUrl();
+            }
+        }
+
+        function getContentContainer() { return window; }
+
+        function updateOverlayModeAndContainer() {
+            _overlayContainer = window;
+            _overlayMode = 'single';
+        }
+
+        function updateOverlayBounds() {
+            if (!_overlay) return;
+            if (_overlayContainer === window) {
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = '0px';
+                _overlay.style.top = '0px';
+                _overlay.width = window.innerWidth;
+                _overlay.height = window.innerHeight;
+            } else {
+                const rect = _overlayContainer.getBoundingClientRect();
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = rect.left + 'px';
+                _overlay.style.top = rect.top + 'px';
+                _overlay.width = Math.max(0, Math.floor(rect.width));
+                _overlay.height = Math.max(0, Math.floor(rect.height));
+            }
+            renderOverlay();
+        }
+
+        function containerScrollLeft() {
+            return (_overlayContainer === window) ? (window.scrollX || 0) : (_overlayContainer.scrollLeft || 0);
+        }
+        function containerScrollTop() {
+            return (_overlayContainer === window) ? (window.scrollY || 0) : (_overlayContainer.scrollTop || 0);
+        }
+
+        function toCanvasCoords(clientX, clientY) {
+            const rect = _overlay.getBoundingClientRect();
+            return { x: clientX - rect.left, y: clientY - rect.top };
+        }
+
+        function onPointerDown(e) {
+            const tool = document.body.dataset.tool;
+            if (tool === 'arrow') {
+                startDrawArrow(e);
+            } else if (tool === 'pen') {
+                startDrawPen(e);
+            } else if (tool === 'eraser') {
+                eraseAt(e);
+            } else if (tool === 'spotlight') {
+                startDrawSpotlight(e);
+            }
+        }
+
+        function onPointerMove(e) {
+            // Update cursor position
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _cursorX = pt.x;
+            _cursorY = pt.y;
+            
+            if (!_drawing) {
+                // Just update cursor position and re-render
+                if (_cursorVisible) {
+                    renderOverlay();
+                }
+                return;
+            }
+            
+            if (_drawing.type === 'pen') {
+                moveDrawPen(e);
+            } else if (_drawing.type === 'spotlight') {
+                moveDrawSpotlight(e);
+            } else {
+                moveDrawArrow(e);
+            }
+        }
+        
+        function onPointerEnter(e) {
+            _cursorVisible = document.body.dataset.tool !== 'none';
+            if (_cursorVisible) {
+                renderOverlay();
+            }
+        }
+        
+        function onPointerLeave(e) {
+            _cursorVisible = false;
+            renderOverlay();
+        }
+
+        function onPointerUp(e) {
+            if (!_drawing) return;
+            if (_drawing.type === 'pen') {
+                endDrawPen();
+            } else if (_drawing.type === 'spotlight') {
+                endDrawSpotlight();
+            } else {
+                endDrawArrow();
+            }
+        }
+
+        function startDrawArrow(e) {
+            if (document.body.dataset.tool !== 'arrow') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                x1: pt.x + containerScrollLeft(),
+                y1: pt.y + containerScrollTop(),
+                x2: pt.x + containerScrollLeft(),
+                y2: pt.y + containerScrollTop(),
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawArrow(e) {
+            if (!_drawing) return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.x2 = pt.x + containerScrollLeft();
+            _drawing.y2 = pt.y + containerScrollTop();
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawArrow() {
+            if (!_drawing) return;
+            _shapes.push({ 
+                type: 'arrow', 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawPen(e) {
+            if (document.body.dataset.tool !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'pen',
+                points: [{
+                    x: pt.x + containerScrollLeft(),
+                    y: pt.y + containerScrollTop()
+                }],
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawPen(e) {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.points.push({
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop()
+            });
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawPen() {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            if (_drawing.points.length > 1) {
+                _shapes.push({ 
+                    ..._drawing,
+                    createdAt: Date.now(),
+                    fadeoutTime: getFadeoutTime(),
+                    opacity: 1.0
+                });
+            }
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawSpotlight(e) {
+            if (document.body.dataset.tool !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'spotlight',
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop(),
+                radius: getLineThickness() * 20, // Use thickness to control spotlight size (bigger default)
+                color: getArrowColor()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawSpotlight(e) {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const dx = pt.x + containerScrollLeft() - _drawing.x;
+            const dy = pt.y + containerScrollTop() - _drawing.y;
+            _drawing.radius = Math.max(20, Math.sqrt(dx * dx + dy * dy)); // Minimum radius of 20
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawSpotlight() {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            _shapes.push({ 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function distPointToSegment(px, py, x1, y1, x2, y2) {
+            const dx = x2 - x1, dy = y2 - y1;
+            if (dx === 0 && dy === 0) return Math.hypot(px - x1, py - y1);
+            const t = Math.max(0, Math.min(1, ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)));
+            const cx = x1 + t * dx, cy = y1 + t * dy;
+            return Math.hypot(px - cx, py - cy);
+        }
+
+        function eraseAt(e) {
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const x = pt.x + containerScrollLeft();
+            const y = pt.y + containerScrollTop();
+            const threshold = 10; // pixels
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const s = _shapes[i];
+                if (s.type === 'arrow') {
+                    const d = distPointToSegment(x, y, s.x1, s.y1, s.x2, s.y2);
+                    if (d <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                } else if (s.type === 'pen' && s.points) {
+                    // Check if click is near any line segment in the pen stroke
+                    let minDist = Infinity;
+                    for (let j = 1; j < s.points.length; j++) {
+                        const d = distPointToSegment(x, y, s.points[j-1].x, s.points[j-1].y, s.points[j].x, s.points[j].y);
+                        minDist = Math.min(minDist, d);
+                    }
+                    if (minDist <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                }
+            }
+            e.preventDefault();
+        }
+
+        function drawArrow(ctx, x1, y1, x2, y2, color, width, opacity = 1.0) {
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.fillStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            
+            // Check if points are too close (initial state)
+            const dx = x2 - x1;
+            const dy = y2 - y1;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            
+            if (distance < 5) {
+                // Draw just a small arrowhead pointing down-right when first clicked
+                const defaultAngle = Math.PI / 4; // 45 degrees (down-right)
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate arrowhead points
+                const hx1 = x1 + headLength * Math.cos(defaultAngle - headAngle);
+                const hy1 = y1 + headLength * Math.sin(defaultAngle - headAngle);
+                const hx2 = x1 + headLength * Math.cos(defaultAngle + headAngle);
+                const hy2 = y1 + headLength * Math.sin(defaultAngle + headAngle);
+                
+                // Draw arrowhead only
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            } else {
+                // Normal arrow drawing - head at x1,y1, tail at x2,y2
+                const angle = Math.atan2(y1 - y2, x1 - x2);
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate where the line should end (before the arrowhead)
+                const lineEndX = x1 - headLength * 0.8 * Math.cos(angle);
+                const lineEndY = y1 - headLength * 0.8 * Math.sin(angle);
+                
+                // Draw the line from tail to near the head
+                ctx.beginPath();
+                ctx.moveTo(x2, y2);
+                ctx.lineTo(lineEndX, lineEndY);
+                ctx.stroke();
+                
+                // Calculate arrowhead points
+                const hx1 = x1 - headLength * Math.cos(angle - headAngle);
+                const hy1 = y1 - headLength * Math.sin(angle - headAngle);
+                const hx2 = x1 - headLength * Math.cos(angle + headAngle);
+                const hy2 = y1 - headLength * Math.sin(angle + headAngle);
+                
+                // Draw arrowhead
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            }
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawPen(ctx, points, color, width, offX, offY, opacity = 1.0) {
+            if (!points || points.length < 2) return;
+            
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            ctx.beginPath();
+            ctx.moveTo(points[0].x - offX, points[0].y - offY);
+            for (let i = 1; i < points.length; i++) {
+                ctx.lineTo(points[i].x - offX, points[i].y - offY);
+            }
+            ctx.stroke();
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawAllSpotlights(ctx, spotlights, offX, offY) {
+            if (!spotlights || spotlights.length === 0) return;
+            
+            ctx.save();
+            
+            // Calculate the overall opacity based on all spotlights
+            const maxOpacity = Math.max(...spotlights.map(s => s.opacity || 1.0));
+            
+            // Fill entire canvas with dark overlay
+            ctx.fillStyle = `rgba(0, 0, 0, ${0.7 * maxOpacity})`;
+            ctx.fillRect(0, 0, ctx.canvas.width, ctx.canvas.height);
+            
+            // Cut out completely transparent holes for all spotlights
+            ctx.globalCompositeOperation = 'destination-out';
+            ctx.fillStyle = 'rgba(0, 0, 0, 1)'; // Solid black to ensure complete removal
+            for (const spotlight of spotlights) {
+                ctx.beginPath();
+                ctx.arc(spotlight.x - offX, spotlight.y - offY, spotlight.radius, 0, 2 * Math.PI);
+                ctx.fill();
+            }
+            
+            ctx.restore();
+        }
+
+        /**
+         * Repaints the whole overlay canvas.
+         *
+         * Paint order (later layers cover earlier ones):
+         *   1. committed arrows and pen strokes from `_shapes`,
+         *   2. the arrow or pen stroke currently being drawn,
+         *   3. the spotlight layer: committed spotlights, the one being drawn,
+         *      and a cursor preview when the spotlight tool is active,
+         *   4. the cursor indicator for the non-spotlight tools.
+         *
+         * Stored coordinates have the current scroll offsets subtracted before
+         * drawing. Does nothing until the canvas and its 2D context exist.
+         */
+        function renderOverlay() {
+            if (!_overlay || !_overlayCtx) return;
+            _overlayCtx.clearRect(0, 0, _overlay.width, _overlay.height);
+            const offX = containerScrollLeft();
+            const offY = containerScrollTop();
+            // Draw non-spotlight shapes first
+            for (const s of _shapes) {
+                // Shapes without an opacity field are drawn fully opaque.
+                const opacity = s.opacity !== undefined ? s.opacity : 1.0;
+                if (s.type === 'arrow') {
+                    drawArrow(_overlayCtx, s.x1 - offX, s.y1 - offY, s.x2 - offX, s.y2 - offY, s.color || '#f00', s.width || 2, opacity);
+                } else if (s.type === 'pen') {
+                    drawPen(_overlayCtx, s.points, s.color || '#f00', s.width || 2, offX, offY, opacity);
+                }
+            }
+            // Draw current drawing (non-spotlight)
+            if (_drawing) {
+                if (_drawing.type === 'pen') {
+                    drawPen(_overlayCtx, _drawing.points, _drawing.color, _drawing.width, offX, offY);
+                } else if (_drawing.type !== 'spotlight') {
+                    // Any other in-progress drawing is rendered as an arrow.
+                    drawArrow(_overlayCtx, _drawing.x1 - offX, _drawing.y1 - offY, _drawing.x2 - offX, _drawing.y2 - offY, _drawing.color, _drawing.width);
+                }
+            }
+            
+            // Collect all spotlights (existing + current drawing + cursor preview)
+            const spotlights = [];
+            
+            // Add existing spotlight shapes
+            for (const s of _shapes) {
+                if (s.type === 'spotlight') {
+                    spotlights.push({
+                        x: s.x,
+                        y: s.y, 
+                        radius: s.radius,
+                        opacity: s.opacity !== undefined ? s.opacity : 1.0
+                    });
+                }
+            }
+            
+            // Add current spotlight being drawn
+            if (_drawing && _drawing.type === 'spotlight') {
+                spotlights.push({
+                    x: _drawing.x,
+                    y: _drawing.y,
+                    radius: _drawing.radius,
+                    opacity: 1.0
+                });
+            }
+            
+            // Add cursor preview spotlight if tool is active
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                if (tool === 'spotlight') {
+                    const thickness = getLineThickness();
+                    const radius = thickness * 20;
+                    // The cursor is tracked in canvas coordinates; add the scroll
+                    // offsets so it matches the stored spotlight coordinates.
+                    const cursorWorldX = _cursorX + containerScrollLeft();
+                    const cursorWorldY = _cursorY + containerScrollTop();
+                    spotlights.push({
+                        x: cursorWorldX,
+                        y: cursorWorldY,
+                        radius: radius,
+                        opacity: 0.8
+                    });
+                }
+            }
+            
+            // Draw all spotlights as a single overlay with multiple holes
+            drawAllSpotlights(_overlayCtx, spotlights, offX, offY);
+            
+            // Draw cursor indicators for non-spotlight tools
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                const color = getArrowColor();
+                const thickness = getLineThickness();
+                
+                if (tool !== 'spotlight') {
+                    _overlayCtx.save();
+                    _overlayCtx.fillStyle = color;
+                    _overlayCtx.globalAlpha = 0.7;
+                    
+                    if (tool === 'eraser') {
+                        // Draw eraser indicator
+                        _overlayCtx.strokeStyle = color;
+                        _overlayCtx.lineWidth = 2;
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, 10, 0, 2 * Math.PI);
+                        _overlayCtx.stroke();
+                    } else {
+                        // Draw dot for pen/arrow
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, thickness / 2, 0, 2 * Math.PI);
+                        _overlayCtx.fill();
+                    }
+                    
+                    _overlayCtx.restore();
+                }
+            }
+        }
+
+        function setOverlayActive(active) {
+            if (!_overlay) initOverlay();
+            _overlay.style.pointerEvents = active ? 'auto' : 'none';
+            _overlay.style.cursor = active ? 'none' : 'auto';
+            // Re-render to ensure visibility aligns with content
+            renderOverlay();
+        }
+
+        function initOverlay() {
+            if (_overlay) return;
+            updateOverlayModeAndContainer();
+            _overlay = document.createElement('canvas');
+            _overlay.className = 'draw-overlay';
+            _overlayCtx = _overlay.getContext('2d');
+            document.body.appendChild(_overlay);
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+
+            // Events
+            _overlay.addEventListener('mousedown', onPointerDown);
+            _overlay.addEventListener('mousemove', onPointerMove);
+            _overlay.addEventListener('mouseenter', onPointerEnter);
+            _overlay.addEventListener('mouseleave', onPointerLeave);
+            document.addEventListener('mouseup', onPointerUp);
+            _overlay.addEventListener('touchstart', onPointerDown, { passive: false });
+            _overlay.addEventListener('touchmove', onPointerMove, { passive: false });
+            document.addEventListener('touchend', onPointerUp);
+
+            _overlayResizeHandler = () => updateOverlayBounds();
+            window.addEventListener('resize', _overlayResizeHandler);
+
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+            
+            // Start fade animation timer
+            _fadeTimer = setInterval(updateShapesFade, 100); // Update every 100ms for smooth fade
+        }
+
+        function rebindOverlayContainer() {
+            if (!_overlay) return;
+            // Remove old scroll handler
+            if (_overlayScrollHandler) { window.removeEventListener('scroll', _overlayScrollHandler); }
+            updateOverlayModeAndContainer();
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+        }
+
+        function teardownOverlay() {
+            if (!_overlay) return;
+            _overlay.removeEventListener('mousedown', onPointerDown);
+            _overlay.removeEventListener('mousemove', onPointerMove);
+            _overlay.removeEventListener('mouseenter', onPointerEnter);
+            _overlay.removeEventListener('mouseleave', onPointerLeave);
+            document.removeEventListener('mouseup', onPointerUp);
+            _overlay.removeEventListener('touchstart', onPointerDown);
+            _overlay.removeEventListener('touchmove', onPointerMove);
+            document.removeEventListener('touchend', onPointerUp);
+            if (_overlayResizeHandler) window.removeEventListener('resize', _overlayResizeHandler);
+            if (_overlayScrollHandler) {
+                if (_overlayContainer === window) {
+                    window.removeEventListener('scroll', _overlayScrollHandler);
+                } else if (_overlayContainer) {
+                    _overlayContainer.removeEventListener('scroll', _overlayScrollHandler);
+                }
+            }
+            if (_fadeTimer) {
+                clearInterval(_fadeTimer);
+                _fadeTimer = null;
+            }
+            if (_overlay.parentNode) _overlay.parentNode.removeChild(_overlay);
+            _overlay = null; _overlayCtx = null; _overlayContainer = null; _overlayResizeHandler = null; _overlayScrollHandler = null; _drawing = null;
+        }
+        
+        function teardownFileExplorer() {
+            const fe = document.querySelector('.file-explorer');
+            if (fe && fe.parentNode) fe.parentNode.removeChild(fe);
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+
+        function runCell(cellId){
+            const btn=document.querySelector('.run-btn[onclick*="'+cellId+'"]');
+            const output=document.getElementById('output-'+cellId);
+            if(btn){btn.textContent='⏳ running...';btn.disabled=true;}
+            if(output){output.classList.add('output-stale');}
+            fetch('/run/'+cellId,{method:'POST'}).then(r=>r.json()).then(data=>{
+                if(output){
+                    output.classList.remove('output-stale');
+                    let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
+
+                    console.log('UV Logs:', data);
+                    if(data.stderr) {
+                        // Split UV logs from regular stderr
+                        const lines = data.stderr.split('\\n');
+                        let uvLogs = [];
+                        let regularLogs = [];
+                        let inUvSection = true;
+
+                        for (const line of lines) {
+                            if (inUvSection) {
+                                uvLogs.push(line);
+                                if (line.startsWith('Installed ')) {
+                                    inUvSection = false;
+                                }
+                            } else {
+                                regularLogs.push(line);
+                            }
+                        }
+                        
+
+                        // If we never found "Installed", treat it all as regular stderr
+                        if (inUvSection) {
+                            html+='<div class="cell-stderr">'+escapeHtml(data.stderr)+'</div>';
+                        } else {
+                            const uvLogsStr = uvLogs.join('\\n');
+                            const regularLogsStr = regularLogs.join('\\n').trim();
+
+                            if (uvLogsStr) {
+                                html+='<div class="uv-install-logs">';
+                                html+='<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>';
+                                html+='<div class="uv-logs-content" style="display: none;">'+escapeHtml(uvLogsStr)+'</div>';
+                                html+='</div>';
+                            }
+                            if (regularLogsStr) {
+                                html+='<div class="cell-stderr">'+escapeHtml(regularLogsStr)+'</div>';
+                            }
+                        }
+                    }
+                    output.innerHTML=html;
+                }
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            }).catch(e=>{
+                console.error('Run failed:',e);
+                if(output){output.classList.remove('output-stale');}
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            });
+        }
+
+        function copyCell(cellId){
+            // Try multiple selectors to find the code element
+            // Pygments generates .highlight > pre with spans, not wrapped in <code>
+            let codeElement = document.querySelector('#code-'+cellId+' .highlight pre');
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' pre');
+            }
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' code');
+            }
+            if (!codeElement) {
+                // Fallback to the code div itself
+                codeElement = document.getElementById('code-'+cellId);
+            }
+
+            const btn = document.querySelector('.copy-btn[onclick*="'+cellId+'"]');
+
+            if (!codeElement) {
+                console.error('Code element not found for cell:', cellId);
+                return;
+            }
+            if (!btn) {
+                console.error('Copy button not found for cell:', cellId);
+                return;
+            }
+
+            const codeText = codeElement.textContent;
+            
+            if (navigator.clipboard && navigator.clipboard.writeText) {
+                navigator.clipboard.writeText(codeText).then(function() {
+                    console.log('Clipboard copy successful');
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                }).catch(function(err) {
+                    console.warn('Clipboard copy failed:', err);
+                    fallbackCopy();
+                });
+            } else {
+                console.log('Using fallback copy method');
+                fallbackCopy();
+            }
+            
+            function fallbackCopy() {
+                const textarea = document.createElement('textarea');
+                textarea.value = codeText;
+                textarea.style.position = 'absolute';
+                textarea.style.left = '-9999px';
+                document.body.appendChild(textarea);
+                textarea.select();
+                try {
+                    const success = document.execCommand('copy');
+                    console.log('Fallback copy success:', success);
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                } catch (err) {
+                    console.error('Fallback copy failed:', err);
+                    btn.textContent = 'Copy failed';
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                    }, 2000);
+                }
+                document.body.removeChild(textarea);
+            }
+        }
+
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
+
+
+        // Page bootstrap, run once the DOM has been parsed: snapshots the
+        // initial cell states, refreshes theme/menu/debug UI, initialises the
+        // optional widgets, applies state encoded in the URL, and installs the
+        // document-level mouse and keyboard handlers.
+        document.addEventListener('DOMContentLoaded', function() {
+            // Capture initial cell states before any modifications
+            captureInitialCellStates();
+            
+            updateThemeIcon();
+            updateUiMenu();
+            updateUiDebug();
+            // Widgets are on unless the root element's data-widgets attribute
+            // is set to something other than 'on'.
+            const widgetsEnabled = (document.documentElement.getAttribute('data-widgets') || 'on') === 'on';
+            if (widgetsEnabled) {
+            initMinimap();
+            initFileExplorer();
+            initTools();
+            initOverlay();
+            initStatusWidget();
+            initializeWidgetVisibility();
+            layoutWidgetsStackedBottomRight();
+            window.addEventListener('resize', layoutWidgetsStackedBottomRight);
+            }
+
+            // Apply deep-link selection if present
+            applyLocationFromUrl();
+            updateStateIndicator();
+            
+            // Apply cell states from URL immediately
+            const url = new URL(window.location.href);
+            const encodedCellStates = url.searchParams.get('cells');
+            if (encodedCellStates) {
+                console.log('Applying cell states from URL...');
+                const cellStates = decodeCellStatesFromUrl(encodedCellStates);
+                
+                // Use requestAnimationFrame to ensure DOM is ready
+                requestAnimationFrame(() => {
+                    applyCellStatesFromUrl(cellStates);
+                    
+                    // Clear initialization flag after cell states are applied
+                    if (typeof _isInitializing !== 'undefined') {
+                        _isInitializing = false;
+                    }
+                });
+            } else {
+                // Clear initialization flag even if no cell states
+                if (typeof _isInitializing !== 'undefined') {
+                    requestAnimationFrame(() => {
+                        _isInitializing = false;
+                    });
+                }
+            }
+
+            // Bind drag selection on line numbers
+            document.addEventListener('mousedown', onLineNumberMouseDown);
+            document.addEventListener('mousemove', onDocMouseMove);
+            document.addEventListener('mouseup', onDocMouseUp);
+
+            // Add ESC key handler to exit tools
+            document.addEventListener('keydown', function(e) {
+                // e.keyCode === 27 is checked as well as e.key.
+                if (e.key === 'Escape' || e.keyCode === 27) {
+                    const currentTool = document.body.dataset.tool;
+                    if (currentTool && currentTool !== 'none') {
+                        // Deactivate the current tool
+                        window.setActiveTool('none');
+                    }
+                    // Also clear any active line selection
+                    clearSelection(true);
+                }
+            });
+        });
+
+        function updateStateIndicator() {
+            try {
+                const el = document.getElementById('status-widget');
+                if (!el) return;
+                const tool = document.body.dataset.tool || 'none';
+                if (tool && tool !== 'none') {
+                    el.textContent = `tool: ${tool} — Esc`;
+                    return;
+                }
+                if (_selection) {
+                    const t = _selection.a === _selection.b ? `L${_selection.a}` : `L${_selection.a}-${_selection.b}`;
+                    el.textContent = `selected: ${t} — Esc`;
+                    return;
+                }
+                el.textContent = 'ready — Esc';
+            } catch (_) {}
+        }
+    </script>
+</head>
+
+
+<body>
+    <div class="controls">
+        <div class="controls-buttons">
+            
+            <a href="index.html" class="back-button">← back</a>
+            
+            <div class="theme-toggle" onclick="toggleTheme()">light</div>
+            <div class="reset-toggle" onclick="resetLayout()">reset</div>
+            <div class="menu-button" onclick="toggleMenu()">
+                menu ▼
+                <div class="menu-dropdown">
+                    <div class="menu-item" onclick="setUiTheme('default')">
+                        <span class="menu-checkbox" id="checkbox-ui-default">☑</span> Theme: default
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('none')">
+                        <span class="menu-checkbox" id="checkbox-ui-none">☐</span> Theme: none
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('monocolor')">
+                        <span class="menu-checkbox" id="checkbox-ui-monocolor">☐</span> Theme: monocolor
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('tools')">
+                        <span class="menu-checkbox" id="checkbox-tools">☐</span> Tools
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('file-explorer')">
+                        <span class="menu-checkbox" id="checkbox-file-explorer">☐</span> File Explorer
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('minimap')">
+                        <span class="menu-checkbox" id="checkbox-minimap">☐</span> Table of Contents
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('status')">
+                        <span class="menu-checkbox" id="checkbox-status">☑</span> Status Indicator
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    
+    <div class="system-info">
+        <div class="system-info-header">Generated on:</div>
+        <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+        </div>
+    </div>
+    
+    <div class="main-content">
+        <h1>Deformable DETR Multi-Scale Deformable Attention Benchmarks - Aggregated Results</h1>
+<p>This document combines benchmark results from multiple Deformable DETR implementations.</p>
+<h2>Combined Summary and Visualization</h2>
+<div class="artifact-preview">
+<?xml version='1.0' encoding='utf-8'?>
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:ns4="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="864pt" height="576pt" viewBox="0 0 864 576" version="1.1">
+ <metadata>
+  <rdf:RDF>
+   <ns2:Work>
+    <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+    <dc:date>2025-10-31T20:14:23.345627</dc:date>
+    <dc:format>image/svg+xml</dc:format>
+    <dc:creator>
+     <ns2:Agent>
+      <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
+     </ns2:Agent>
+    </dc:creator>
+   </ns2:Work>
+  </rdf:RDF>
+ </metadata>
+ <defs>
+  <style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
+ </defs>
+ <g id="figure--latency" class="figure">
+  <g id="patch_1">
+   <path d="M 0 576  L 864 576  L 864 0  L 0 0  L 0 576  z " style="fill: none" />
+  </g>
+  <g id="axes--1" class="axes">
+   <g id="patch_2">
+    <path d="M 47.72 425.105974  L 824.19299 425.105974  L 824.19299 26.88  L 47.72 26.88  L 47.72 425.105974  z " style="fill: none" />
+   </g>
+   <g id="matplotlib.axis_1">
+    <g id="xtick_1">
+     <g id="grid-x--1" class="grid grid-x">
+      <path d="M 83.014227 425.105974  L 83.014227 26.88  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_1">
+      <defs>
+       <path id="mafb3703e5b" d="M 0 0  L 0 3.5  " style="stroke: #000000; stroke-width: 0.8" />
+      </defs>
+      <g>
+       <use ns4:href="#mafb3703e5b" x="83.014227" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_1">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(28.96641 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
+     </g>
+    </g>
+    <g id="xtick_2">
+     <g id="grid-x--2" class="grid grid-x">
+      <path d="M 318.309072 425.105974  L 318.309072 26.88  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_2">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="318.309072" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_2">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(264.261255 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
+     </g>
+    </g>
+    <g id="xtick_3">
+     <g id="grid-x--3" class="grid grid-x">
+      <path d="M 553.603918 425.105974  L 553.603918 26.88  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_3">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="553.603918" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_3">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(499.556101 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
+     </g>
+    </g>
+    <g id="xtick_4">
+     <g id="grid-x--4" class="grid grid-x">
+      <path d="M 788.898763 425.105974  L 788.898763 26.88  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_4">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="788.898763" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_4">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.850946 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
+     </g>
+    </g>
+    <g id="label--x" class="xlabel">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="562.545859" transform="rotate(-0 435.956495 562.545859)">Workload</text>
+    </g>
+   </g>
+   <g id="matplotlib.axis_2">
+    <g id="ytick_1">
+     <g id="grid-y--2" class="grid grid-y">
+      <path d="M 47.72 410.537508  L 824.19299 410.537508  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_5">
+      <defs>
+       <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
+      </defs>
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="410.537508" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_5">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.336726" transform="rotate(-0 40.72 414.336726)">0.0</text>
+     </g>
+    </g>
+    <g id="ytick_2">
+     <g id="grid-y--3" class="grid grid-y">
+      <path d="M 47.72 365.072599  L 824.19299 365.072599  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_6">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="365.072599" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_6">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="368.871817" transform="rotate(-0 40.72 368.871817)">0.5</text>
+     </g>
+    </g>
+    <g id="ytick_3">
+     <g id="grid-y--4" class="grid grid-y">
+      <path d="M 47.72 319.60769  L 824.19299 319.60769  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_7">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="319.60769" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_7">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="323.406908" transform="rotate(-0 40.72 323.406908)">1.0</text>
+     </g>
+    </g>
+    <g id="ytick_4">
+     <g id="grid-y--5" class="grid grid-y">
+      <path d="M 47.72 274.142781  L 824.19299 274.142781  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_8">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="274.142781" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_8">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="277.941999" transform="rotate(-0 40.72 277.941999)">1.5</text>
+     </g>
+    </g>
+    <g id="ytick_5">
+     <g id="grid-y--6" class="grid grid-y">
+      <path d="M 47.72 228.677872  L 824.19299 228.677872  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_9">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="228.677872" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_9">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="232.47709" transform="rotate(-0 40.72 232.47709)">2.0</text>
+     </g>
+    </g>
+    <g id="ytick_6">
+     <g id="grid-y--7" class="grid grid-y">
+      <path d="M 47.72 183.212963  L 824.19299 183.212963  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_10">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="183.212963" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_10">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="187.012181" transform="rotate(-0 40.72 187.012181)">2.5</text>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="grid-y--8" class="grid grid-y">
+      <path d="M 47.72 137.748054  L 824.19299 137.748054  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_11">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="137.748054" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_11">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="141.547272" transform="rotate(-0 40.72 141.547272)">3.0</text>
+     </g>
+    </g>
+    <g id="ytick_8">
+     <g id="grid-y--9" class="grid grid-y">
+      <path d="M 47.72 92.283145  L 824.19299 92.283145  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_12">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="92.283145" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_12">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="96.082363" transform="rotate(-0 40.72 96.082363)">3.5</text>
+     </g>
+    </g>
+    <g id="ytick_9">
+     <g id="grid-y--10" class="grid grid-y">
+      <path d="M 47.72 46.818236  L 824.19299 46.818236  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_13">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="46.818236" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_13">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="50.617454" transform="rotate(-0 40.72 50.617454)">4.0</text>
+     </g>
+    </g>
+    <g id="label--y" class="ylabel">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="225.992987" transform="rotate(-90 18.737188 225.992987)">Latency P50 (ms)</text>
+    </g>
+   </g>
+   <g id="series--hf-kernels-deformable-detr" class="series">
+    <path d="M 83.014227 407.004793  L 318.309072 406.123683  L 553.603918 406.242801  L 788.898763 406.163692  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <defs>
+     <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
+    </defs>
+    <g clip-path="url(#pb5c8282ea4)">
+     <use ns4:href="#md7efaf3aec" x="83.014227" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="318.309072" y="406.123683" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="553.603918" y="406.242801" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="788.898763" y="406.163692" style="fill: #1f77b4; stroke: #1f77b4" />
+    </g>
+   </g>
+   <g id="series--torch-eager" class="series">
+    <path d="M 83.014227 101.985538  L 318.309072 46.108619  L 553.603918 44.981181  L 788.898763 44.993002  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <defs>
+     <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
+    </defs>
+    <g clip-path="url(#pb5c8282ea4)">
+     <use ns4:href="#m9b8c54d372" x="83.014227" y="101.985538" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="318.309072" y="46.108619" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="553.603918" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="788.898763" y="44.993002" style="fill: #ff7f0e; stroke: #ff7f0e" />
+    </g>
+   </g>
+   <g id="patch_3">
+    <path d="M 47.72 425.105974  L 47.72 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_4">
+    <path d="M 824.19299 425.105974  L 824.19299 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_5">
+    <path d="M 47.72 425.105974  L 824.19299 425.105974  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_6">
+    <path d="M 47.72 26.88  L 824.19299 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="text_14">
+    <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="20.88" transform="rotate(-0 435.956495 20.88)">Attention Implementation Latency</text>
+   </g>
+   <g id="legend" class="legend">
+    <g id="patch_7">
+     <path d="M 54.72 64.7925  L 225.330938 64.7925  Q 227.330938 64.7925 227.330938 62.7925  L 227.330938 33.88  Q 227.330938 31.88 225.330938 31.88  L 54.72 31.88  Q 52.72 31.88 52.72 33.88  L 52.72 62.7925  Q 52.72 64.7925 54.72 64.7925  L 54.72 64.7925  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+    </g>
+    <g id="line2d_14">
+     <path d="M 56.72 39.978438  L 66.72 39.978438  L 76.72 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+     <g>
+      <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
+     </g>
+    </g>
+    <g id="legend-label--hf-kernels-deformable-detr" class="legend">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_deformable_detr</text>
+    </g>
+    <g id="line2d_15">
+     <path d="M 56.72 54.934687  L 66.72 54.934687  L 76.72 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+     <g>
+      <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     </g>
+    </g>
+    <g id="legend-label--torch-eager" class="legend">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
+    </g>
+   </g>
+  </g>
+ </g>
+ <defs>
+  <clipPath id="pb5c8282ea4">
+   <rect x="47.72" y="26.88" width="776.47299" height="398.225974" />
+  </clipPath>
+ </defs>
+</svg>
+</div>
+
+<div class="cell" id="cell-combine">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('combine')" style="cursor: pointer;">▶ code</span> 
+<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
+</span> | 
+Cell: combine | 4.34s
+ | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
+<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
+</div>
+<div id="code-combine" class="cell-code collapsed" data-lines="26">
+<div class="highlight-with-lines">
+<div class="line-numbers" id="lines-combine">
+<a class="line-number" data-cell="combine" data-line="1" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 1, true);">1</a>
+<a class="line-number" data-cell="combine" data-line="2" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 2, true);">2</a>
+<a class="line-number" data-cell="combine" data-line="3" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 3, true);">3</a>
+<a class="line-number" data-cell="combine" data-line="4" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 4, true);">4</a>
+<a class="line-number" data-cell="combine" data-line="5" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 5, true);">5</a>
+<a class="line-number" data-cell="combine" data-line="6" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 6, true);">6</a>
+<a class="line-number" data-cell="combine" data-line="7" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 7, true);">7</a>
+<a class="line-number" data-cell="combine" data-line="8" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 8, true);">8</a>
+<a class="line-number" data-cell="combine" data-line="9" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 9, true);">9</a>
+<a class="line-number" data-cell="combine" data-line="10" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 10, true);">10</a>
+<a class="line-number" data-cell="combine" data-line="11" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 11, true);">11</a>
+<a class="line-number" data-cell="combine" data-line="12" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 12, true);">12</a>
+<a class="line-number" data-cell="combine" data-line="13" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 13, true);">13</a>
+<a class="line-number" data-cell="combine" data-line="14" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 14, true);">14</a>
+<a class="line-number" data-cell="combine" data-line="15" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 15, true);">15</a>
+<a class="line-number" data-cell="combine" data-line="16" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 16, true);">16</a>
+<a class="line-number" data-cell="combine" data-line="17" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 17, true);">17</a>
+<a class="line-number" data-cell="combine" data-line="18" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 18, true);">18</a>
+<a class="line-number" data-cell="combine" data-line="19" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 19, true);">19</a>
+<a class="line-number" data-cell="combine" data-line="20" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 20, true);">20</a>
+<a class="line-number" data-cell="combine" data-line="21" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 21, true);">21</a>
+<a class="line-number" data-cell="combine" data-line="22" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 22, true);">22</a>
+<a class="line-number" data-cell="combine" data-line="23" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 23, true);">23</a>
+<a class="line-number" data-cell="combine" data-line="24" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 24, true);">24</a>
+<a class="line-number" data-cell="combine" data-line="25" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 25, true);">25</a>
+<a class="line-number" data-cell="combine" data-line="26" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 26, true);">26</a>
+</div>
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
+<span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
+<span class="c1"># dependencies = [</span>
+<span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
+<span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
+<span class="c1">#     &quot;matplotlib&quot;,</span>
+<span class="c1"># ]</span>
+<span class="c1">#</span>
+<span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
+<span class="c1"># ///</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">kernels_benchmark_tools.core.visuals</span><span class="w"> </span><span class="kn">import</span> <span class="n">generate_combined_results</span>
+
+<span class="c1"># Map display names to uvnote environment variables</span>
+<span class="n">cache_env_map</span> <span class="o">=</span> <span class="p">{</span>
+    <span class="s2">&quot;HF Kernels Deformable DETR&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_HF_KERNELS_DEFORMABLE_DETR_BENCHMARK&quot;</span><span class="p">,</span>
+    <span class="s2">&quot;PyTorch Deformable DETR&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_TORCH_DEFORMABLE_DETR_BENCHMARK&quot;</span><span class="p">,</span>
+<span class="p">}</span>
+
+<span class="c1"># Generate combined results with visualization</span>
+<span class="n">generate_combined_results</span><span class="p">(</span>
+    <span class="n">cache_env_map</span><span class="o">=</span><span class="n">cache_env_map</span><span class="p">,</span>
+    <span class="n">output_filename</span><span class="o">=</span><span class="s2">&quot;deformable_detr.jsonl&quot;</span><span class="p">,</span>
+    <span class="n">svg_filename</span><span class="o">=</span><span class="s2">&quot;latency.svg&quot;</span>
+<span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-combine"></div>
+</div>
+</div>
+</div>
+<div id="output-combine" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels Deformable DETR    : /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/8ab95d7f8f4c6a375b95806e646e4e6f12f0749960d319cf7587215b378ccfa9
+✓ PyTorch Deformable DETR       : /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/9c0a40cf66719a0b460ebb0ca3b41bcaf6c5486905bbf2045a65be2710694dfa
+
+  ✓ Found HF Kernels Deformable DETR
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/8ab95d7f8f4c6a375b95806e646e4e6f12f0749960d319cf7587215b378ccfa9/deformable_detr.jsonl
+  ✓ Found PyTorch Deformable DETR
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/9c0a40cf66719a0b460ebb0ca3b41bcaf6c5486905bbf2045a65be2710694dfa/deformable_detr.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4     0.04  True
+hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4     0.05  True
+hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4     0.05  True
+torch_eager              cuda_B1_Q100_H8_E256_L4_P4     3.39  True
+torch_eager              cuda_B1_Q300_H8_E256_L4_P4     4.01  True
+torch_eager              cuda_B2_Q100_H8_E256_L4_P4     4.02  True
+torch_eager              cuda_B2_Q300_H8_E256_L4_P4     4.02  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 8 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+  ✓ HF Kernels Deformable DETR
+  ✓ PyTorch Deformable DETR
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-combine">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 216ms
+</div>
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/combine/latency.svg" class="artifact" target="_blank">latency.svg</a>
+<div class="artifact-preview">
+<?xml version='1.0' encoding='utf-8'?>
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:ns4="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="864pt" height="576pt" viewBox="0 0 864 576" version="1.1">
+ <metadata>
+  <rdf:RDF>
+   <ns2:Work>
+    <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+    <dc:date>2025-10-31T20:14:23.345627</dc:date>
+    <dc:format>image/svg+xml</dc:format>
+    <dc:creator>
+     <ns2:Agent>
+      <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
+     </ns2:Agent>
+    </dc:creator>
+   </ns2:Work>
+  </rdf:RDF>
+ </metadata>
+ <defs>
+  <style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
+ </defs>
+ <g id="figure--latency" class="figure">
+  <g id="patch_1">
+   <path d="M 0 576  L 864 576  L 864 0  L 0 0  L 0 576  z " style="fill: none" />
+  </g>
+  <g id="axes--1" class="axes">
+   <g id="patch_2">
+    <path d="M 47.72 425.105974  L 824.19299 425.105974  L 824.19299 26.88  L 47.72 26.88  L 47.72 425.105974  z " style="fill: none" />
+   </g>
+   <g id="matplotlib.axis_1">
+    <g id="xtick_1">
+     <g id="grid-x--1" class="grid grid-x">
+      <path d="M 83.014227 425.105974  L 83.014227 26.88  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_1">
+      <defs>
+       <path id="mafb3703e5b" d="M 0 0  L 0 3.5  " style="stroke: #000000; stroke-width: 0.8" />
+      </defs>
+      <g>
+       <use ns4:href="#mafb3703e5b" x="83.014227" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_1">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(28.96641 549.280197) rotate(-45)">cuda_B1_Q100_H8_E256_L4_P4</text>
+     </g>
+    </g>
+    <g id="xtick_2">
+     <g id="grid-x--2" class="grid grid-x">
+      <path d="M 318.309072 425.105974  L 318.309072 26.88  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_2">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="318.309072" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_2">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(264.261255 549.280197) rotate(-45)">cuda_B1_Q300_H8_E256_L4_P4</text>
+     </g>
+    </g>
+    <g id="xtick_3">
+     <g id="grid-x--3" class="grid grid-x">
+      <path d="M 553.603918 425.105974  L 553.603918 26.88  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_3">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="553.603918" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_3">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(499.556101 549.280197) rotate(-45)">cuda_B2_Q100_H8_E256_L4_P4</text>
+     </g>
+    </g>
+    <g id="xtick_4">
+     <g id="grid-x--4" class="grid grid-x">
+      <path d="M 788.898763 425.105974  L 788.898763 26.88  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_4">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="788.898763" y="425.105974" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_4">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(734.850946 549.280197) rotate(-45)">cuda_B2_Q300_H8_E256_L4_P4</text>
+     </g>
+    </g>
+    <g id="label--x" class="xlabel">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="562.545859" transform="rotate(-0 435.956495 562.545859)">Workload</text>
+    </g>
+   </g>
+   <g id="matplotlib.axis_2">
+    <g id="ytick_1">
+     <g id="grid-y--2" class="grid grid-y">
+      <path d="M 47.72 410.537508  L 824.19299 410.537508  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_5">
+      <defs>
+       <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
+      </defs>
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="410.537508" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_5">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="414.336726" transform="rotate(-0 40.72 414.336726)">0.0</text>
+     </g>
+    </g>
+    <g id="ytick_2">
+     <g id="grid-y--3" class="grid grid-y">
+      <path d="M 47.72 365.072599  L 824.19299 365.072599  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_6">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="365.072599" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_6">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="368.871817" transform="rotate(-0 40.72 368.871817)">0.5</text>
+     </g>
+    </g>
+    <g id="ytick_3">
+     <g id="grid-y--4" class="grid grid-y">
+      <path d="M 47.72 319.60769  L 824.19299 319.60769  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_7">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="319.60769" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_7">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="323.406908" transform="rotate(-0 40.72 323.406908)">1.0</text>
+     </g>
+    </g>
+    <g id="ytick_4">
+     <g id="grid-y--5" class="grid grid-y">
+      <path d="M 47.72 274.142781  L 824.19299 274.142781  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_8">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="274.142781" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_8">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="277.941999" transform="rotate(-0 40.72 277.941999)">1.5</text>
+     </g>
+    </g>
+    <g id="ytick_5">
+     <g id="grid-y--6" class="grid grid-y">
+      <path d="M 47.72 228.677872  L 824.19299 228.677872  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_9">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="228.677872" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_9">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="232.47709" transform="rotate(-0 40.72 232.47709)">2.0</text>
+     </g>
+    </g>
+    <g id="ytick_6">
+     <g id="grid-y--7" class="grid grid-y">
+      <path d="M 47.72 183.212963  L 824.19299 183.212963  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_10">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="183.212963" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_10">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="187.012181" transform="rotate(-0 40.72 187.012181)">2.5</text>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="grid-y--8" class="grid grid-y">
+      <path d="M 47.72 137.748054  L 824.19299 137.748054  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_11">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="137.748054" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_11">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="141.547272" transform="rotate(-0 40.72 141.547272)">3.0</text>
+     </g>
+    </g>
+    <g id="ytick_8">
+     <g id="grid-y--9" class="grid grid-y">
+      <path d="M 47.72 92.283145  L 824.19299 92.283145  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_12">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="92.283145" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_12">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="96.082363" transform="rotate(-0 40.72 96.082363)">3.5</text>
+     </g>
+    </g>
+    <g id="ytick_9">
+     <g id="grid-y--10" class="grid grid-y">
+      <path d="M 47.72 46.818236  L 824.19299 46.818236  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_13">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="47.72" y="46.818236" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_13">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="50.617454" transform="rotate(-0 40.72 50.617454)">4.0</text>
+     </g>
+    </g>
+    <g id="label--y" class="ylabel">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="225.992987" transform="rotate(-90 18.737188 225.992987)">Latency P50 (ms)</text>
+    </g>
+   </g>
+   <g id="series--hf-kernels-deformable-detr" class="series">
+    <path d="M 83.014227 407.004793  L 318.309072 406.123683  L 553.603918 406.242801  L 788.898763 406.163692  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <defs>
+     <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
+    </defs>
+    <g clip-path="url(#pb5c8282ea4)">
+     <use ns4:href="#md7efaf3aec" x="83.014227" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="318.309072" y="406.123683" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="553.603918" y="406.242801" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="788.898763" y="406.163692" style="fill: #1f77b4; stroke: #1f77b4" />
+    </g>
+   </g>
+   <g id="series--torch-eager" class="series">
+    <path d="M 83.014227 101.985538  L 318.309072 46.108619  L 553.603918 44.981181  L 788.898763 44.993002  " clip-path="url(#pb5c8282ea4)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <defs>
+     <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
+    </defs>
+    <g clip-path="url(#pb5c8282ea4)">
+     <use ns4:href="#m9b8c54d372" x="83.014227" y="101.985538" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="318.309072" y="46.108619" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="553.603918" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="788.898763" y="44.993002" style="fill: #ff7f0e; stroke: #ff7f0e" />
+    </g>
+   </g>
+   <g id="patch_3">
+    <path d="M 47.72 425.105974  L 47.72 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_4">
+    <path d="M 824.19299 425.105974  L 824.19299 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_5">
+    <path d="M 47.72 425.105974  L 824.19299 425.105974  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_6">
+    <path d="M 47.72 26.88  L 824.19299 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="text_14">
+    <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="435.956495" y="20.88" transform="rotate(-0 435.956495 20.88)">Attention Implementation Latency</text>
+   </g>
+   <g id="legend" class="legend">
+    <g id="patch_7">
+     <path d="M 54.72 64.7925  L 225.330938 64.7925  Q 227.330938 64.7925 227.330938 62.7925  L 227.330938 33.88  Q 227.330938 31.88 225.330938 31.88  L 54.72 31.88  Q 52.72 31.88 52.72 33.88  L 52.72 62.7925  Q 52.72 64.7925 54.72 64.7925  L 54.72 64.7925  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+    </g>
+    <g id="line2d_14">
+     <path d="M 56.72 39.978438  L 66.72 39.978438  L 76.72 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+     <g>
+      <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
+     </g>
+    </g>
+    <g id="legend-label--hf-kernels-deformable-detr" class="legend">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">hf_kernels_deformable_detr</text>
+    </g>
+    <g id="line2d_15">
+     <path d="M 56.72 54.934687  L 66.72 54.934687  L 76.72 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+     <g>
+      <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     </g>
+    </g>
+    <g id="legend-label--torch-eager" class="legend">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">torch_eager</text>
+    </g>
+   </g>
+  </g>
+ </g>
+ <defs>
+  <clipPath id="pb5c8282ea4">
+   <rect x="47.72" y="26.88" width="776.47299" height="398.225974" />
+  </clipPath>
+ </defs>
+</svg>
+</div>
+</div>
+</div>
+</div>
+    </div>
+    
+</body>
+</html>
\ No newline at end of file
diff --git a/deformable_detr/results/index.html b/deformable_detr/results/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..259f497868f81b516b1f0c893e4974cda430c731
--- /dev/null
+++ b/deformable_detr/results/index.html
@@ -0,0 +1,88 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset='UTF-8'>
+  <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+  <title>Index of /deformable_detr/results</title>
+  <style>
+    :root {
+      --bg-primary: #0a0a0a;
+      --bg-secondary: #121212;
+      --bg-tertiary: #181818;
+      --text-primary: #e0e0e0;
+      --text-secondary: #888888;
+      --text-link: #64b5f6;
+      --border-primary: #2a2a2a;
+    }
+    body {
+      font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+      background: var(--bg-primary);
+      color: var(--text-primary);
+      margin: 0;
+      padding: 16px;
+      max-width: 900px;
+      margin: 0 auto;
+    }
+    .controls {
+      display: flex;
+      justify-content: flex-end;
+      margin-bottom: 1rem;
+    }
+    .back-button {
+      background: var(--bg-secondary);
+      border: 1px solid var(--border-primary);
+      padding: 8px 12px;
+      border-radius: 4px;
+      color: var(--text-secondary);
+      cursor: pointer;
+      font-size: 0.9rem;
+      text-decoration: none;
+      display: inline-block;
+    }
+    .back-button:hover {
+      color: var(--text-primary);
+      background: var(--bg-tertiary);
+    }
+    h1 {
+      font-size: 1.5em;
+      margin: 1rem 0;
+      color: var(--text-primary);
+      border-bottom: 1px solid var(--border-primary);
+      padding-bottom: 0.5rem;
+    }
+    ul {
+      list-style-type: none;
+      padding: 0;
+    }
+    li {
+      margin: 0;
+      border-bottom: 1px solid var(--border-primary);
+    }
+    li:last-child {
+      border-bottom: none;
+    }
+    a {
+      display: block;
+      padding: 0.75rem 0.5rem;
+      text-decoration: none;
+      color: var(--text-link);
+      transition: background 0.2s ease;
+    }
+    a:hover {
+      background: var(--bg-secondary);
+    }
+    .dir {
+      font-weight: 500;
+    }
+  </style>
+</head>
+<body>
+  <div class='controls'>
+    <a href='../index.html' class='back-button'>← back</a>
+  </div>
+  <h1>Index of /deformable_detr/results</h1>
+  <ul>
+    <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
+  </ul>
+</body>
+</html>
\ No newline at end of file
diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl
index de592afa82ec05256019431f6592e8e321594c40..d381f496ddfa4abddae090de1e302f3856ab3fc4 100644
--- a/flash_attn/impls/artifacts/benchmark/attention.jsonl
+++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl
@@ -1,6 +1,6 @@
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9094910000158052, "p50": 0.9113720000186731, "p90": 0.9181919999718957, "mean": 0.9141214000010223, "iqr": 0.007780999965234514, "raw_times": [0.9104110000066612, 0.9094910000158052, 0.9113720000186731, 0.9181919999718957, 0.9211409999920761], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9259819999556385, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9480720000283327, "p50": 0.9496129999888581, "p90": 0.9558429999856344, "mean": 0.952826599996115, "iqr": 0.00735100002202671, "raw_times": [0.9480720000283327, 0.9484919999636077, 0.9496129999888581, 0.9558429999856344, 0.962113000014142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9554529999604711, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0061439999731192, "p50": 1.0189639999680367, "p90": 1.0215840000000753, "mean": 1.017895999996199, "iqr": 0.0038299999687296804, "raw_times": [1.0189639999680367, 1.025034000008418, 1.0177540000313456, 1.0061439999731192, 1.0215840000000753], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0171540000101231, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0146539999595916, "p50": 1.019383999960155, "p90": 1.0202839999919888, "mean": 1.018159799980367, "iqr": 0.004200999967451935, "raw_times": [1.0202839999919888, 1.0146539999595916, 1.0160830000245369, 1.0203939999655631, 1.019383999960155], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0248149999938505, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1737179999613545, "p50": 1.184327000032681, "p90": 1.1859380000487363, "mean": 1.186479800003326, "iqr": 0.010300000042207103, "raw_times": [1.1756380000065292, 1.1737179999613545, 1.1859380000487363, 1.184327000032681, 1.2127779999673294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1959679999904438, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1702179999701912, "p50": 1.1838479999823903, "p90": 1.1906280000175684, "mean": 1.1843698000006952, "iqr": 0.016700999992735888, "raw_times": [1.1739270000248325, 1.1702179999701912, 1.1838479999823903, 1.1906280000175684, 1.2032280000084938], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1880579999683505, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.208432000112225, "p50": 1.215130999980829, "p90": 1.2198710001030122, "mean": 1.215487200033749, "iqr": 0.006680000069536618, "raw_times": [1.2208109999392036, 1.208432000112225, 1.2198710001030122, 1.2131910000334756, 1.215130999980829], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2240119999660237, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.26713200006634, "p50": 1.2766830000146001, "p90": 1.277253000125711, "mean": 1.2749268000789016, "iqr": 0.004750000016429112, "raw_times": [1.277253000125711, 1.26713200006634, 1.2766830000146001, 1.281063000078575, 1.2725030001092819], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2717629999769997, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2928539999847999, "p50": 1.3003640001443273, "p90": 1.3163240000721999, "mean": 1.3067478000721167, "iqr": 0.01689100008661626, "raw_times": [1.3003640001443273, 1.2928539999847999, 1.2994329999855836, 1.3163240000721999, 1.3247640001736727], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3026630001604644, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.3232850001259067, "p50": 1.3295650001055037, "p90": 1.3361950000216893, "mean": 1.332684600038192, "iqr": 0.007890999995652237, "raw_times": [1.328304000026037, 1.3361950000216893, 1.3295650001055037, 1.3232850001259067, 1.3460739999118232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3245140000890387, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4790479999646777, "p50": 1.4950690001569455, "p90": 1.4989779999723396, "mean": 1.4914904000306706, "iqr": 0.017840000055002747, "raw_times": [1.5032190001420531, 1.4950690001569455, 1.4790479999646777, 1.4811379999173369, 1.4989779999723396], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5107090000583412, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.511368999899787, "p50": 1.5117090001695033, "p90": 1.512698999931672, "mean": 1.516499199988175, "iqr": 0.00113999999484804, "raw_times": [1.511368999899787, 1.512698999931672, 1.5117090001695033, 1.511558999936824, 1.5351600000030885], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5183190000698232, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py
index 15f02e2ed444e10eba9708f3f69247414b6c962b..8f163bdd918898ced9e858cd4197a85572d7ec8e 100644
--- a/flash_attn/impls/cells/benchmark.py
+++ b/flash_attn/impls/cells/benchmark.py
@@ -4,7 +4,6 @@
 #     "numpy",
 #     "torch==2.8.0",
 #     "kernels-benchmark-tools",
-#     "kernels",
 # ]
 #
 # [tool.uv.sources]
@@ -13,19 +12,18 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
 
-# Load the flash attention 3 kernel
-hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
 
-
-def hf_flash_attention3(query, key, value):
-    return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+def torch_flash(q, k, v):
+    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+    return o.transpose(1, 2).contiguous()
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.ATTENTION,
-    impl_name="hf_kernels_flash_attn3",
-    impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
-    impl_func=hf_flash_attention3,
+    impl_name="torch_flash_ma",
+    impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+    impl_func=torch_flash,
 )
\ No newline at end of file
diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html
index 501ea20e924b7038a53903e7992899b1953d98eb..1852a8c0fb83365b1e619b7e38354ebd1d45d747 100644
--- a/flash_attn/impls/flash_attention.html
+++ b/flash_attn/impls/flash_attention.html
@@ -4110,7 +4110,7 @@ Cell: nv | 0.21s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="3">
 <div class="code-wrap">
@@ -4123,7 +4123,7 @@ Cell: nv | 0.21s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:36 2025       
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:13:43 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4132,7 +4132,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0             75W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   43C    P0             83W /  350W |       0MiB /  46068MiB |     11%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4154,13 +4154,13 @@ Cell: nv | 0.21s
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 7.50s
+Cell: benchmark | 3.87s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="29">
 <div class="code-wrap">
@@ -4207,29 +4207,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.587ms       102.23%       3.587ms       3.587ms             1  
-                                         torch_flash_ma         7.11%     370.236us        47.42%       2.468ms       2.468ms       0.000us         0.00%       3.549ms       3.549ms             1  
-                     aten::scaled_dot_product_attention         0.85%      44.391us         4.44%     231.334us      77.111us       0.000us         0.00%       2.791ms     930.498us             3  
-              aten::_scaled_dot_product_flash_attention         0.51%      26.381us         3.59%     186.943us      62.314us       0.000us         0.00%       2.791ms     930.498us             3  
-                         aten::_flash_attention_forward         0.76%      39.658us         2.57%     134.002us      44.667us       2.791ms        79.55%       2.791ms     930.498us             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.791ms        79.55%       2.791ms     930.498us             3  
-                                       aten::contiguous         0.30%      15.641us        34.37%       1.789ms     149.098us       0.000us         0.00%     757.697us      63.141us            12  
-                                            aten::clone         0.74%      38.596us        34.07%       1.774ms     147.794us       0.000us         0.00%     757.697us      63.141us            12  
-                                            aten::copy_         1.78%      92.553us        31.63%       1.647ms     137.218us     717.505us        20.45%     757.697us      63.141us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     717.505us        20.45%     717.505us      59.792us            12  
-                                Activity Buffer Request        27.90%       1.452ms        27.90%       1.452ms       1.452ms      40.192us         1.15%      40.192us      40.192us             1  
-                                        aten::transpose         1.49%      77.390us         2.00%     104.302us       4.346us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.52%      26.912us         0.52%      26.912us       1.121us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.55%      28.453us         2.13%     110.953us       7.397us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.93%     100.211us         1.93%     100.211us       4.175us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.45%     127.363us         2.45%     127.363us       8.491us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.32%      16.580us         0.32%      16.580us       5.527us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.05%       2.441us         0.05%       2.441us       0.407us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.18%       9.241us         0.18%       9.241us       3.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        52.58%       2.737ms        52.58%       2.737ms       2.737ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.600ms       101.99%       3.600ms       3.600ms             1  
+                                         torch_flash_ma         6.70%     350.157us        46.68%       2.439ms       2.439ms       0.000us         0.00%       3.570ms       3.570ms             1  
+                     aten::scaled_dot_product_attention         0.81%      42.281us         4.26%     222.626us      74.209us       0.000us         0.00%       2.816ms     938.781us             3  
+              aten::_scaled_dot_product_flash_attention         0.52%      27.002us         3.45%     180.345us      60.115us       0.000us         0.00%       2.816ms     938.781us             3  
+                         aten::_flash_attention_forward         0.79%      41.210us         2.54%     132.453us      44.151us       2.816ms        79.78%       2.816ms     938.781us             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.816ms        79.78%       2.816ms     938.781us             3  
+                                       aten::contiguous         0.29%      15.041us        34.44%       1.800ms     149.962us       0.000us         0.00%     753.884us      62.824us            12  
+                                            aten::clone         0.75%      38.969us        34.15%       1.785ms     148.709us       0.000us         0.00%     753.884us      62.824us            12  
+                                            aten::copy_         1.73%      90.324us        31.78%       1.661ms     138.388us     713.788us        20.22%     753.884us      62.824us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     713.788us        20.22%     713.788us      59.482us            12  
+                                Activity Buffer Request        28.08%       1.467ms        28.08%       1.467ms       1.467ms      40.096us         1.14%      40.096us      40.096us             1  
+                                        aten::transpose         1.25%      65.371us         1.68%      87.543us       3.648us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.42%      22.172us         0.42%      22.172us       0.924us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.53%      27.463us         2.06%     107.524us       7.168us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.78%      93.220us         1.78%      93.220us       3.884us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.49%     130.035us         2.49%     130.035us       8.669us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.32%      16.730us         0.32%      16.730us       5.577us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.690us         0.05%       2.690us       0.448us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.17%       9.000us         0.17%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.32%       2.786ms        53.32%       2.786ms       2.786ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.205ms
-Self CUDA time total: 3.509ms
+Self CPU time total: 5.225ms
+Self CUDA time total: 3.530ms
 
 
 
@@ -4239,29 +4239,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.72%     248.136us        41.78%       2.196ms       2.196ms       0.000us         0.00%       3.803ms       3.803ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.759ms       100.28%       3.759ms       3.759ms             1  
-                     aten::scaled_dot_product_attention         0.51%      26.852us         3.40%     178.734us      59.578us       0.000us         0.00%       2.990ms     996.607us             3  
-              aten::_scaled_dot_product_flash_attention         0.35%      18.418us         2.89%     151.882us      50.627us       0.000us         0.00%       2.990ms     996.607us             3  
-                         aten::_flash_attention_forward         0.65%      34.063us         2.10%     110.562us      36.854us       2.990ms        79.76%       2.990ms     996.607us             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.990ms        79.76%       2.990ms     996.607us             3  
-                                       aten::contiguous         0.19%      10.079us        32.75%       1.721ms     143.446us       0.000us         0.00%     813.629us      67.802us            12  
-                                            aten::clone         0.54%      28.151us        32.56%       1.711ms     142.606us       0.000us         0.00%     813.629us      67.802us            12  
-                                            aten::copy_         1.97%     103.281us        30.84%       1.621ms     135.084us     758.782us        20.24%     813.629us      67.802us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     758.782us        20.24%     758.782us      63.232us            12  
-                                Activity Buffer Request        27.29%       1.434ms        27.29%       1.434ms       1.434ms      54.847us         1.46%      54.847us      54.847us             1  
-                                        aten::transpose         0.98%      51.741us         1.34%      70.423us       2.934us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.36%      18.682us         0.36%      18.682us       0.778us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.38%      19.848us         1.54%      80.939us       5.396us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.45%      76.001us         1.45%      76.001us       3.167us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         2.04%     106.952us         2.04%     106.952us       7.130us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.26%      13.850us         0.26%      13.850us       4.617us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       1.860us         0.04%       1.860us       0.310us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.760us         0.07%       3.760us       1.253us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.22%       3.060ms        58.22%       3.060ms       3.060ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.88%     260.255us        42.26%       2.252ms       2.252ms       0.000us         0.00%       3.798ms       3.798ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.753ms       100.28%       3.753ms       3.753ms             1  
+                     aten::scaled_dot_product_attention         0.49%      25.890us         3.50%     186.735us      62.245us       0.000us         0.00%       2.976ms     991.858us             3  
+              aten::_scaled_dot_product_flash_attention         0.33%      17.842us         3.02%     160.845us      53.615us       0.000us         0.00%       2.976ms     991.858us             3  
+                         aten::_flash_attention_forward         0.74%      39.289us         2.26%     120.363us      40.121us       2.976ms        79.51%       2.976ms     991.858us             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.976ms        79.51%       2.976ms     991.858us             3  
+                                       aten::contiguous         0.20%      10.403us        33.03%       1.760ms     146.680us       0.000us         0.00%     822.042us      68.504us            12  
+                                            aten::clone         0.53%      28.238us        32.84%       1.750ms     145.813us       0.000us         0.00%     822.042us      68.504us            12  
+                                            aten::copy_         1.51%      80.312us        31.12%       1.659ms     138.210us     766.874us        20.49%     822.042us      68.504us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     766.874us        20.49%     766.874us      63.906us            12  
+                                Activity Buffer Request        28.02%       1.493ms        28.02%       1.493ms       1.493ms      55.168us         1.47%      55.168us      55.168us             1  
+                                        aten::transpose         0.94%      50.313us         1.27%      67.673us       2.820us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      17.360us         0.33%      17.360us       0.723us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.40%      21.528us         1.56%      83.370us       5.558us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.43%      76.263us         1.43%      76.263us       3.178us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.08%     110.943us         2.08%     110.943us       7.396us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.27%      14.621us         0.27%      14.621us       4.874us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.781us         0.03%       1.781us       0.297us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.08%       4.011us         0.08%       4.011us       1.337us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        57.74%       3.077ms        57.74%       3.077ms       3.077ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.255ms
-Self CUDA time total: 3.749ms
+Self CPU time total: 5.329ms
+Self CUDA time total: 3.742ms
 
 
 
@@ -4271,29 +4271,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.59%     242.054us        41.69%       2.201ms       2.201ms       0.000us         0.00%       3.795ms       3.795ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.746ms       100.27%       3.746ms       3.746ms             1  
-                     aten::scaled_dot_product_attention         0.50%      26.150us         3.40%     179.413us      59.804us       0.000us         0.00%       2.957ms     985.581us             3  
-              aten::_scaled_dot_product_flash_attention         0.35%      18.371us         2.90%     153.263us      51.088us       0.000us         0.00%       2.957ms     985.581us             3  
-                         aten::_flash_attention_forward         0.64%      34.041us         2.11%     111.213us      37.071us       2.957ms        79.14%       2.957ms     985.581us             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.957ms        79.14%       2.957ms     985.581us             3  
-                                       aten::contiguous         0.19%       9.991us        32.85%       1.734ms     144.489us       0.000us         0.00%     838.147us      69.846us            12  
-                                            aten::clone         0.52%      27.541us        32.66%       1.724ms     143.657us       0.000us         0.00%     838.147us      69.846us            12  
-                                            aten::copy_         1.47%      77.641us        30.91%       1.632ms     135.987us     779.363us        20.86%     838.147us      69.846us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     779.363us        20.86%     779.363us      64.947us            12  
-                                Activity Buffer Request        27.89%       1.472ms        27.89%       1.472ms       1.472ms      58.784us         1.57%      58.784us      58.784us             1  
-                                        aten::transpose         0.96%      50.819us         1.31%      69.110us       2.880us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.35%      18.291us         0.35%      18.291us       0.762us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.38%      20.141us         1.58%      83.392us       5.559us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.49%      78.782us         1.49%      78.782us       3.283us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         1.99%     104.800us         1.99%     104.800us       6.987us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.27%      14.320us         0.27%      14.320us       4.773us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.04%       1.870us         0.04%       1.870us       0.312us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.720us         0.07%       3.720us       1.240us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.31%       3.078ms        58.31%       3.078ms       3.078ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.87%     262.676us        41.62%       2.245ms       2.245ms       0.000us         0.00%       3.882ms       3.882ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.834ms       100.29%       3.834ms       3.834ms             1  
+                     aten::scaled_dot_product_attention         0.50%      26.770us         3.49%     188.015us      62.672us       0.000us         0.00%       3.044ms       1.015ms             3  
+              aten::_scaled_dot_product_flash_attention         0.35%      18.803us         2.99%     161.245us      53.748us       0.000us         0.00%       3.044ms       1.015ms             3  
+                         aten::_flash_attention_forward         0.74%      39.829us         2.21%     119.102us      39.701us       3.044ms        79.61%       3.044ms       1.015ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.044ms        79.61%       3.044ms       1.015ms             3  
+                                       aten::contiguous         0.18%       9.451us        32.36%       1.746ms     145.465us       0.000us         0.00%     838.367us      69.864us            12  
+                                            aten::clone         0.54%      28.881us        32.18%       1.736ms     144.678us       0.000us         0.00%     838.367us      69.864us            12  
+                                            aten::copy_         1.51%      81.201us        30.48%       1.644ms     137.016us     779.615us        20.39%     838.367us      69.864us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     779.615us        20.39%     779.615us      64.968us            12  
+                                Activity Buffer Request        27.31%       1.473ms        27.31%       1.473ms       1.473ms      58.752us         1.54%      58.752us      58.752us             1  
+                                        aten::transpose         1.01%      54.592us         1.34%      72.471us       3.020us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      17.879us         0.33%      17.879us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.37%      20.117us         1.53%      82.751us       5.517us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.41%      76.295us         1.41%      76.295us       3.179us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.13%     114.795us         2.13%     114.795us       7.653us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.27%      14.801us         0.27%      14.801us       4.934us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       2.110us         0.04%       2.110us       0.352us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.990us         0.07%       3.990us       1.330us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.38%       3.149ms        58.38%       3.149ms       3.149ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.279ms
-Self CUDA time total: 3.736ms
+Self CPU time total: 5.395ms
+Self CUDA time total: 3.823ms
 
 
 
@@ -4303,29 +4303,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.47%     246.252us        42.66%       2.352ms       2.352ms       0.000us         0.00%       3.878ms       3.878ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.831ms       100.28%       3.831ms       3.831ms             1  
-                     aten::scaled_dot_product_attention         0.47%      26.180us         3.22%     177.714us      59.238us       0.000us         0.00%       3.035ms       1.012ms             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      18.934us         2.75%     151.534us      50.511us       0.000us         0.00%       3.035ms       1.012ms             3  
-                         aten::_flash_attention_forward         0.60%      33.169us         1.99%     109.931us      36.644us       3.035ms        79.45%       3.035ms       1.012ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.035ms        79.45%       3.035ms       1.012ms             3  
-                                       aten::contiguous         0.19%      10.269us        34.14%       1.882ms     156.829us       0.000us         0.00%     843.264us      70.272us            12  
-                                            aten::clone         0.51%      27.861us        33.95%       1.872ms     155.974us       0.000us         0.00%     843.264us      70.272us            12  
-                                            aten::copy_         1.39%      76.612us        32.27%       1.779ms     148.225us     785.216us        20.55%     843.264us      70.272us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     785.216us        20.55%     785.216us      65.435us            12  
-                                Activity Buffer Request        26.00%       1.433ms        26.00%       1.433ms       1.433ms      58.048us         1.52%      58.048us      58.048us             1  
-                                        aten::transpose         0.90%      49.620us         1.24%      68.282us       2.845us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.34%      18.662us         0.34%      18.662us       0.778us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.37%      20.139us         1.52%      83.911us       5.594us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.44%      79.524us         1.44%      79.524us       3.313us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         5.29%     291.664us         5.29%     291.664us      19.444us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.25%      13.850us         0.25%      13.850us       4.617us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.810us         0.03%       1.810us       0.302us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       3.620us         0.07%       3.620us       1.207us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        57.34%       3.161ms        57.34%       3.161ms       3.161ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         4.61%     261.106us        43.54%       2.469ms       2.469ms       0.000us         0.00%       3.945ms       3.945ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.898ms       100.28%       3.898ms       3.898ms             1  
+                     aten::scaled_dot_product_attention         0.46%      26.241us         3.40%     192.654us      64.218us       0.000us         0.00%       3.100ms       1.033ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      19.509us         2.94%     166.413us      55.471us       0.000us         0.00%       3.100ms       1.033ms             3  
+                         aten::_flash_attention_forward         0.74%      42.081us         2.16%     122.633us      40.878us       3.100ms        79.76%       3.100ms       1.033ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.100ms        79.76%       3.100ms       1.033ms             3  
+                                       aten::contiguous         0.20%      11.161us        34.71%       1.968ms     163.994us       0.000us         0.00%     844.704us      70.392us            12  
+                                            aten::clone         0.52%      29.682us        34.51%       1.957ms     163.064us       0.000us         0.00%     844.704us      70.392us            12  
+                                            aten::copy_         1.45%      82.261us        32.81%       1.860ms     155.026us     786.784us        20.24%     844.704us      70.392us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     786.784us        20.24%     786.784us      65.565us            12  
+                                Activity Buffer Request        26.26%       1.489ms        26.26%       1.489ms       1.489ms      57.920us         1.49%      57.920us      57.920us             1  
+                                        aten::transpose         0.95%      53.820us         1.26%      71.322us       2.972us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.31%      17.502us         0.31%      17.502us       0.729us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.39%      21.943us         1.53%      86.983us       5.799us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.40%      79.202us         1.40%      79.202us       3.300us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.55%     314.487us         5.55%     314.487us      20.966us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.26%      14.830us         0.26%      14.830us       4.943us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       2.010us         0.04%       2.010us       0.335us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.040us         0.07%       4.040us       1.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        56.46%       3.201ms        56.46%       3.201ms       3.201ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.512ms
-Self CUDA time total: 3.820ms
+Self CPU time total: 5.670ms
+Self CUDA time total: 3.887ms
 
 
 
@@ -4335,29 +4335,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.69%     283.303us        42.14%       2.547ms       2.547ms       0.000us         0.00%       4.304ms       4.304ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.254ms       100.24%       4.254ms       4.254ms             1  
-                     aten::scaled_dot_product_attention         0.82%      49.722us         3.53%     213.285us      71.095us       0.000us         0.00%       3.439ms       1.146ms             3  
-              aten::_scaled_dot_product_flash_attention         0.34%      20.582us         2.71%     163.563us      54.521us       0.000us         0.00%       3.439ms       1.146ms             3  
-                         aten::_flash_attention_forward         0.62%      37.231us         1.93%     116.771us      38.924us       3.439ms        81.02%       3.439ms       1.146ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.439ms        81.02%       3.439ms       1.146ms             3  
-                                       aten::contiguous         0.18%      10.912us        32.97%       1.993ms     166.068us       0.000us         0.00%     865.695us      72.141us            12  
-                                            aten::clone         0.50%      30.059us        32.79%       1.982ms     165.158us       0.000us         0.00%     865.695us      72.141us            12  
-                                            aten::copy_         1.39%      83.902us        31.17%       1.884ms     157.000us     805.439us        18.98%     865.695us      72.141us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     805.439us        18.98%     805.439us      67.120us            12  
-                                Activity Buffer Request        24.08%       1.456ms        24.08%       1.456ms       1.456ms      60.256us         1.42%      60.256us      60.256us             1  
-                                        aten::transpose         1.06%      63.793us         1.39%      84.162us       3.507us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.34%      20.369us         0.34%      20.369us       0.849us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.36%      21.791us         1.46%      88.331us       5.889us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.33%      80.570us         1.33%      80.570us       3.357us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         6.09%     368.355us         6.09%     368.355us      24.557us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.25%      15.000us         0.25%      15.000us       5.000us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.990us         0.03%       1.990us       0.332us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.160us         0.07%       4.160us       1.387us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        57.86%       3.497ms        57.86%       3.497ms       3.497ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         5.12%     312.519us        40.82%       2.493ms       2.493ms       0.000us         0.00%       4.416ms       4.416ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.365ms       100.24%       4.365ms       4.365ms             1  
+                     aten::scaled_dot_product_attention         0.42%      25.922us         3.20%     195.246us      65.082us       0.000us         0.00%       3.547ms       1.182ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      20.847us         2.77%     169.324us      56.441us       0.000us         0.00%       3.547ms       1.182ms             3  
+                         aten::_flash_attention_forward         0.72%      44.243us         2.07%     126.303us      42.101us       3.547ms        81.45%       3.547ms       1.182ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.547ms        81.45%       3.547ms       1.182ms             3  
+                                       aten::contiguous         0.17%      10.559us        31.73%       1.938ms     161.473us       0.000us         0.00%     869.122us      72.427us            12  
+                                            aten::clone         0.47%      28.763us        31.56%       1.927ms     160.593us       0.000us         0.00%     869.122us      72.427us            12  
+                                            aten::copy_         1.36%      83.033us        30.01%       1.832ms     152.707us     807.906us        18.55%     869.122us      72.427us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     807.906us        18.55%     807.906us      67.326us            12  
+                                Activity Buffer Request        24.51%       1.497ms        24.51%       1.497ms       1.497ms      61.216us         1.41%      61.216us      61.216us             1  
+                                        aten::transpose         0.85%      52.195us         1.14%      69.864us       2.911us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.29%      17.669us         0.29%      17.669us       0.736us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.34%      20.921us         1.44%      87.791us       5.853us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.30%      79.270us         1.30%      79.270us       3.303us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.55%     277.575us         4.55%     277.575us      18.505us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.27%      16.520us         0.27%      16.520us       5.507us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.960us         0.03%       1.960us       0.327us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       4.040us         0.07%       4.040us       1.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.18%       3.614ms        59.18%       3.614ms       3.614ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.045ms
-Self CUDA time total: 4.244ms
+Self CPU time total: 6.107ms
+Self CUDA time total: 4.355ms
 
 
 
@@ -4367,45 +4367,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                         torch_flash_ma         4.04%     248.485us        39.71%       2.440ms       2.440ms       0.000us         0.00%       4.431ms       4.431ms             1  
-                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.380ms       100.24%       4.380ms       4.380ms             1  
-                     aten::scaled_dot_product_attention         0.42%      25.679us         2.90%     178.082us      59.361us       0.000us         0.00%       3.552ms       1.184ms             3  
-              aten::_scaled_dot_product_flash_attention         0.29%      17.912us         2.48%     152.403us      50.801us       0.000us         0.00%       3.552ms       1.184ms             3  
-                         aten::_flash_attention_forward         0.56%      34.360us         1.81%     111.452us      37.151us       3.552ms        81.28%       3.552ms       1.184ms             3  
-void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.552ms        81.28%       3.552ms       1.184ms             3  
-                                       aten::contiguous         0.17%      10.359us        32.01%       1.967ms     163.915us       0.000us         0.00%     879.392us      73.283us            12  
-                                            aten::clone         0.45%      27.371us        31.84%       1.957ms     163.052us       0.000us         0.00%     879.392us      73.283us            12  
-                                            aten::copy_         1.33%      81.681us        30.34%       1.864ms     155.367us     818.048us        18.72%     879.392us      73.283us            12  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     818.048us        18.72%     818.048us      68.171us            12  
-                                Activity Buffer Request        23.48%       1.443ms        23.48%       1.443ms       1.443ms      61.344us         1.40%      61.344us      61.344us             1  
-                                        aten::transpose         0.84%      51.433us         1.14%      69.901us       2.913us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.30%      18.468us         0.30%      18.468us       0.769us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.32%      19.754us         1.37%      83.993us       5.600us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.26%      77.740us         1.26%      77.740us       3.239us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         5.92%     364.005us         5.92%     364.005us      24.267us       0.000us         0.00%       0.000us       0.000us            15  
-                                    aten::empty_strided         0.23%      14.381us         0.23%      14.381us       4.794us       0.000us         0.00%       0.000us       0.000us             3  
-                                 cudaDeviceGetAttribute         0.03%       1.840us         0.03%       1.840us       0.307us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.07%       4.180us         0.07%       4.180us       1.393us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        60.29%       3.705ms        60.29%       3.705ms       3.705ms       0.000us         0.00%       0.000us       0.000us             1  
+                                         torch_flash_ma         3.85%     236.256us        38.02%       2.335ms       2.335ms       0.000us         0.00%       4.535ms       4.535ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.485ms       100.25%       4.485ms       4.485ms             1  
+                     aten::scaled_dot_product_attention         0.43%      26.452us         2.98%     183.275us      61.092us       0.000us         0.00%       3.655ms       1.218ms             3  
+              aten::_scaled_dot_product_flash_attention         0.30%      18.620us         2.55%     156.823us      52.274us       0.000us         0.00%       3.655ms       1.218ms             3  
+                         aten::_flash_attention_forward         0.59%      36.060us         1.88%     115.323us      38.441us       3.655ms        81.69%       3.655ms       1.218ms             3  
+void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.655ms        81.69%       3.655ms       1.218ms             3  
+                                       aten::contiguous         0.16%       9.770us        30.40%       1.867ms     155.567us       0.000us         0.00%     880.065us      73.339us            12  
+                                            aten::clone         0.46%      28.179us        30.24%       1.857ms     154.753us       0.000us         0.00%     880.065us      73.339us            12  
+                                            aten::copy_         1.36%      83.563us        28.74%       1.765ms     147.054us     819.137us        18.31%     880.065us      73.339us            12  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     819.137us        18.31%     819.137us      68.261us            12  
+                                Activity Buffer Request        23.24%       1.427ms        23.24%       1.427ms       1.427ms      60.928us         1.36%      60.928us      60.928us             1  
+                                        aten::transpose         0.86%      52.980us         1.16%      71.060us       2.961us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.29%      18.080us         0.29%      18.080us       0.753us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.34%      20.930us         1.37%      83.913us       5.594us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.25%      77.043us         1.25%      77.043us       3.210us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.54%     278.990us         4.54%     278.990us      18.599us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.24%      14.661us         0.24%      14.661us       4.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.978us         0.03%       1.978us       0.330us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.901us         0.06%       3.901us       1.300us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        61.98%       3.806ms        61.98%       3.806ms       3.806ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.146ms
-Self CUDA time total: 4.370ms
+Self CPU time total: 6.141ms
+Self CUDA time total: 4.474ms
 
 
 impl                     wl                  p50(ms)  ok
 torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
-torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
-torch_flash_ma           cuda_attn_L320_bfloat16     1.29  True
-torch_flash_ma           cuda_attn_L384_bfloat16     1.30  True
-torch_flash_ma           cuda_attn_L448_bfloat16     1.45  True
-torch_flash_ma           cuda_attn_L512_bfloat16     1.49  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.28  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.30  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.33  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.50  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.51  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 225ms
-</div>
-</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html
index 0c6eeb07699e5badcea2a599fa3141678ce81b07..b43f3b2c4b9504821051f29d094124c270a7e0ee 100644
--- a/flash_attn/impls/hf_kernels_flash_attn.html
+++ b/flash_attn/impls/hf_kernels_flash_attn.html
@@ -4104,14 +4104,14 @@ body[data-tool="eraser"] .main-content {
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 10.91s
+Cell: benchmark | 5.83s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
-<a href="https://huggingface.co/kernels-community/flash-attn2" target="_blank" class="hf-btn">🤗 HF</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/flash-attn" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="32">
 <div class="code-wrap">
@@ -4161,21 +4161,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         3.74%     162.312us        41.68%       1.808ms       1.808ms       0.000us         0.00%       3.686ms       3.686ms             1  
-                               _flash_attn_9e27194::fwd         1.67%      72.360us        37.94%       1.646ms     548.560us       2.753ms       100.00%       3.686ms       1.229ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.754ms       100.05%       2.754ms       2.754ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.753ms       100.00%       2.753ms     917.639us             3  
-                                Activity Buffer Request        33.08%       1.435ms        33.08%       1.435ms       1.435ms     933.501us        33.91%     933.501us     933.501us             1  
-                                 cudaDeviceGetAttribute         0.12%       5.209us         0.12%       5.209us       0.347us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.42%      18.210us         1.24%      53.790us      17.930us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.82%      35.580us         0.82%      35.580us      11.860us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.58%      25.153us         0.58%      25.153us       2.795us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.26%      11.441us         0.26%      11.441us       3.814us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.99%      42.781us         0.99%      42.781us      14.260us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.32%       2.530ms        58.32%       2.530ms       2.530ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         3.51%     153.413us        41.11%       1.797ms       1.797ms       0.000us         0.00%       3.733ms       3.733ms             1  
+                               _flash_attn_9e27194::fwd         1.62%      70.702us        37.60%       1.644ms     547.894us       2.785ms       100.00%       3.733ms       1.244ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.786ms       100.05%       2.786ms       2.786ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.785ms       100.00%       2.785ms     928.303us             3  
+                                Activity Buffer Request        32.92%       1.439ms        32.92%       1.439ms       1.439ms     947.706us        34.03%     947.706us     947.706us             1  
+                                 cudaDeviceGetAttribute         0.11%       4.891us         0.11%       4.891us       0.326us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.37%      16.181us         1.17%      51.061us      17.020us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.80%      34.880us         0.80%      34.880us      11.627us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.59%      25.681us         0.59%      25.681us       2.853us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.26%      11.340us         0.26%      11.340us       3.780us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.93%      40.731us         0.93%      40.731us      13.577us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.89%       2.575ms        58.89%       2.575ms       2.575ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.338ms
-Self CUDA time total: 2.753ms
+Self CPU time total: 4.372ms
+Self CUDA time total: 2.785ms
 
 
 
@@ -4185,21 +4185,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.52%     113.464us        37.14%       1.670ms       1.670ms       0.000us         0.00%       3.984ms       3.984ms             1  
-                               _flash_attn_9e27194::fwd         1.10%      49.632us        34.61%       1.557ms     518.855us       2.977ms       100.00%       3.984ms       1.328ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.979ms       100.05%       2.979ms       2.979ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.977ms       100.00%       2.977ms     992.348us             3  
-                                Activity Buffer Request        31.69%       1.425ms        31.69%       1.425ms       1.425ms       1.007ms        33.82%       1.007ms       1.007ms             1  
-                                 cudaDeviceGetAttribute         0.08%       3.769us         0.08%       3.769us       0.251us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.17%       7.560us         0.54%      24.080us       8.027us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.37%      16.520us         0.37%      16.520us       5.507us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.47%      21.170us         0.47%      21.170us       2.352us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.08%       3.820us         0.08%       3.820us       1.273us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.64%      28.910us         0.64%      28.910us       9.637us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.86%       2.827ms        62.86%       2.827ms       2.827ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         1.94%      86.682us        37.50%       1.676ms       1.676ms       0.000us         0.00%       3.929ms       3.929ms             1  
+                               _flash_attn_9e27194::fwd         1.06%      47.570us        35.56%       1.589ms     529.734us       2.938ms       100.00%       3.929ms       1.310ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.939ms       100.05%       2.939ms       2.939ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.938ms       100.00%       2.938ms     979.209us             3  
+                                Activity Buffer Request        32.66%       1.460ms        32.66%       1.460ms       1.460ms     991.166us        33.74%     991.166us     991.166us             1  
+                                 cudaDeviceGetAttribute         0.10%       4.450us         0.10%       4.450us       0.297us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.19%       8.440us         0.55%      24.690us       8.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.36%      16.250us         0.36%      16.250us       5.417us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.51%      22.872us         0.51%      22.872us       2.541us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.07%       3.350us         0.07%       3.350us       1.117us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.60%      26.611us         0.60%      26.611us       8.870us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        62.50%       2.794ms        62.50%       2.794ms       2.794ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.497ms
-Self CUDA time total: 2.977ms
+Self CPU time total: 4.469ms
+Self CUDA time total: 2.938ms
 
 
 
@@ -4209,21 +4209,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.39%     108.133us        36.58%       1.655ms       1.655ms       0.000us         0.00%       4.040ms       4.040ms             1  
-                               _flash_attn_9e27194::fwd         1.06%      48.029us        34.19%       1.547ms     515.608us       3.016ms       100.00%       4.040ms       1.347ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.017ms       100.05%       3.017ms       3.017ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.016ms       100.00%       3.016ms       1.005ms             3  
-                                Activity Buffer Request        31.28%       1.415ms        31.28%       1.415ms       1.415ms       1.024ms        33.96%       1.024ms       1.024ms             1  
-                                 cudaDeviceGetAttribute         0.09%       4.281us         0.09%       4.281us       0.285us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.16%       7.121us         0.52%      23.411us       7.804us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.36%      16.290us         0.36%      16.290us       5.430us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.49%      22.080us         0.49%      22.080us       2.453us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.08%       3.840us         0.08%       3.840us       1.280us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.66%      29.710us         0.66%      29.710us       9.903us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        63.42%       2.870ms        63.42%       2.870ms       2.870ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.38%     109.313us        36.70%       1.683ms       1.683ms       0.000us         0.00%       4.081ms       4.081ms             1  
+                               _flash_attn_9e27194::fwd         1.05%      48.167us        34.31%       1.574ms     524.567us       3.048ms       100.00%       4.081ms       1.360ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.049ms       100.05%       3.049ms       3.049ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.048ms       100.00%       3.048ms       1.016ms             3  
+                                Activity Buffer Request        31.46%       1.443ms        31.46%       1.443ms       1.443ms       1.033ms        33.90%       1.033ms       1.033ms             1  
+                                 cudaDeviceGetAttribute         0.09%       4.231us         0.09%       4.231us       0.282us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.16%       7.250us         0.52%      23.960us       7.987us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.36%      16.710us         0.36%      16.710us       5.570us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.46%      21.300us         0.46%      21.300us       2.367us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.561us         0.08%       3.561us       1.187us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.64%      29.473us         0.64%      29.473us       9.824us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.30%       2.903ms        63.30%       2.903ms       2.903ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.525ms
-Self CUDA time total: 3.016ms
+Self CPU time total: 4.586ms
+Self CUDA time total: 3.048ms
 
 
 
@@ -4233,21 +4233,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.32%     109.992us        39.04%       1.848ms       1.848ms       0.000us         0.00%       4.060ms       4.060ms             1  
-                               _flash_attn_9e27194::fwd         1.05%      49.564us        36.71%       1.738ms     579.317us       3.035ms       100.00%       4.060ms       1.353ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.037ms       100.05%       3.037ms       3.037ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.035ms       100.00%       3.035ms       1.012ms             3  
-                                Activity Buffer Request        29.72%       1.407ms        29.72%       1.407ms       1.407ms       1.025ms        33.76%       1.025ms       1.025ms             1  
-                                 cudaDeviceGetAttribute         0.08%       3.690us         0.08%       3.690us       0.246us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.16%       7.770us         0.54%      25.380us       8.460us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.37%      17.610us         0.37%      17.610us       5.870us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.47%      22.139us         0.47%      22.139us       2.460us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.08%       3.790us         0.08%       3.790us       1.263us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.78%     226.343us         4.78%     226.343us      75.448us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        60.96%       2.886ms        60.96%       2.886ms       2.886ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.13%     103.094us        38.83%       1.884ms       1.884ms       0.000us         0.00%       4.165ms       4.165ms             1  
+                               _flash_attn_9e27194::fwd         0.99%      47.838us        36.71%       1.781ms     593.521us       3.114ms       100.00%       4.165ms       1.388ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.116ms       100.05%       3.116ms       3.116ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.114ms       100.00%       3.114ms       1.038ms             3  
+                                Activity Buffer Request        29.59%       1.435ms        29.59%       1.435ms       1.435ms       1.051ms        33.75%       1.051ms       1.051ms             1  
+                                 cudaDeviceGetAttribute         0.08%       3.800us         0.08%       3.800us       0.253us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.16%       7.891us         0.53%      25.811us       8.604us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.37%      17.920us         0.37%      17.920us       5.973us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.45%      21.731us         0.45%      21.731us       2.415us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.740us         0.08%       3.740us       1.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.99%     242.187us         4.99%     242.187us      80.729us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        61.17%       2.967ms        61.17%       2.967ms       2.967ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.734ms
-Self CUDA time total: 3.035ms
+Self CPU time total: 4.851ms
+Self CUDA time total: 3.114ms
 
 
 
@@ -4257,21 +4257,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.11%     110.542us        35.45%       1.860ms       1.860ms       0.000us         0.00%       4.719ms       4.719ms             1  
-                               _flash_attn_9e27194::fwd         0.97%      51.080us        33.34%       1.750ms     583.220us       3.535ms       100.00%       4.719ms       1.573ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.537ms       100.04%       3.537ms       3.537ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.535ms       100.00%       3.535ms       1.178ms             3  
-                                Activity Buffer Request        27.95%       1.467ms        27.95%       1.467ms       1.467ms       1.184ms        33.49%       1.184ms       1.184ms             1  
-                                 cudaDeviceGetAttribute         0.07%       3.640us         0.07%       3.640us       0.243us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.14%       7.520us         0.47%      24.731us       8.244us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.33%      17.211us         0.33%      17.211us       5.737us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.43%      22.670us         0.43%      22.670us       2.519us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.07%       3.800us         0.07%       3.800us       1.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.37%     176.824us         3.37%     176.824us      58.941us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        64.55%       3.388ms        64.55%       3.388ms       3.388ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.00%     105.522us        34.61%       1.828ms       1.828ms       0.000us         0.00%       4.806ms       4.806ms             1  
+                               _flash_attn_9e27194::fwd         0.94%      49.622us        32.62%       1.723ms     574.192us       3.597ms       100.00%       4.806ms       1.602ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.599ms       100.05%       3.599ms       3.599ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.597ms       100.00%       3.597ms       1.199ms             3  
+                                Activity Buffer Request        27.37%       1.446ms        27.37%       1.446ms       1.446ms       1.209ms        33.59%       1.209ms       1.209ms             1  
+                                 cudaDeviceGetAttribute         0.08%       3.991us         0.08%       3.991us       0.266us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.14%       7.250us         0.47%      24.620us       8.207us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.33%      17.370us         0.33%      17.370us       5.790us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.41%      21.681us         0.41%      21.681us       2.409us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.07%       3.770us         0.07%       3.770us       1.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.28%     173.384us         3.28%     173.384us      57.795us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        65.39%       3.453ms        65.39%       3.453ms       3.453ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.248ms
-Self CUDA time total: 3.535ms
+Self CPU time total: 5.281ms
+Self CUDA time total: 3.597ms
 
 
 
@@ -4281,41 +4281,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_flash_attn         2.24%     118.861us        34.58%       1.832ms       1.832ms       0.000us         0.00%       4.834ms       4.834ms             1  
-                               _flash_attn_9e27194::fwd         0.90%      47.900us        32.34%       1.713ms     571.163us       3.618ms       100.00%       4.834ms       1.611ms             3  
-                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.619ms       100.04%       3.619ms       3.619ms             1  
-void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.618ms       100.00%       3.618ms       1.206ms             3  
-                                Activity Buffer Request        27.32%       1.448ms        27.32%       1.448ms       1.448ms       1.217ms        33.63%       1.217ms       1.217ms             1  
-                                 cudaDeviceGetAttribute         0.07%       3.661us         0.07%       3.661us       0.244us       0.000us         0.00%       0.000us       0.000us            15  
-                                       aten::empty_like         0.14%       7.320us         0.50%      26.231us       8.744us       0.000us         0.00%       0.000us       0.000us             3  
-                                    aten::empty_strided         0.36%      18.911us         0.36%      18.911us       6.304us       0.000us         0.00%       0.000us       0.000us             3  
-                                            aten::empty         0.40%      21.351us         0.40%      21.351us       2.372us       0.000us         0.00%       0.000us       0.000us             9  
-                                   cudaFuncSetAttribute         0.08%       4.160us         0.08%       4.160us       1.387us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.07%     162.463us         3.07%     162.463us      54.154us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        65.42%       3.466ms        65.42%       3.466ms       3.466ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_flash_attn         2.02%     107.892us        33.82%       1.810ms       1.810ms       0.000us         0.00%       4.930ms       4.930ms             1  
+                               _flash_attn_9e27194::fwd         0.91%      48.918us        31.80%       1.702ms     567.268us       3.687ms       100.00%       4.930ms       1.643ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.689ms       100.04%       3.689ms       3.689ms             1  
+void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.687ms       100.00%       3.687ms       1.229ms             3  
+                                Activity Buffer Request        26.86%       1.437ms        26.86%       1.437ms       1.437ms       1.242ms        33.69%       1.242ms       1.242ms             1  
+                                 cudaDeviceGetAttribute         0.07%       3.881us         0.07%       3.881us       0.259us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.14%       7.591us         0.49%      26.111us       8.704us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.35%      18.520us         0.35%      18.520us       6.173us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.39%      20.640us         0.39%      20.640us       2.293us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.07%       3.561us         0.07%       3.561us       1.187us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.01%     161.306us         3.01%     161.306us      53.769us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        66.18%       3.542ms        66.18%       3.542ms       3.542ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.299ms
-Self CUDA time total: 3.618ms
+Self CPU time total: 5.351ms
+Self CUDA time total: 3.687ms
 
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.94  True
-hf_kernels_flash_attn    cuda_attn_L256_bfloat16     0.99  True
-hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.03  True
-hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.05  True
-hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.21  True
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.95  True
+hf_kernels_flash_attn    cuda_attn_L256_bfloat16     1.00  True
+hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.05  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.06  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.23  True
 hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.23  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-Installed 15 packages in 15ms
+<div class="cell-stderr">
+Fetching 20 files:   0%|          | 0/20 [00:00&lt;?, ?it/s]
+Fetching 20 files:  10%|█         | 2/20 [00:01&lt;00:15,  1.19it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 11.87it/s]
 </div>
-</div>
-<div class="cell-stderr">Fetching 20 files:   0%|          | 0/20 [00:00&lt;?, ?it/s]
-Fetching 20 files:   5%|▌         | 1/20 [00:00&lt;00:02,  8.29it/s]
-Fetching 20 files:  10%|█         | 2/20 [00:06&lt;01:08,  3.82s/it]
-Fetching 20 files: 100%|██████████| 20/20 [00:06&lt;00:00,  3.06it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html
index 16d419ea57e2fe2c3ccff8a3a3f19df88ec10363..a1db1794336426cb37d9956eacf119e09a093fa1 100644
--- a/flash_attn/impls/hf_kernels_flash_attn3.html
+++ b/flash_attn/impls/hf_kernels_flash_attn3.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 5.55s
+Cell: benchmark | 5.53s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
 <a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="31">
@@ -4160,19 +4160,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         4.02%     170.054us        45.66%       1.931ms       1.931ms       0.000us         0.00%       3.489ms       3.489ms             1  
-                                          FlashAttnFunc         2.98%     126.112us        41.64%       1.761ms     586.890us       0.000us         0.00%       3.489ms       1.163ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.85%      78.440us        38.65%       1.635ms     544.853us       2.605ms       100.00%       3.489ms       1.163ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.606ms       100.06%       2.606ms       2.606ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.605ms       100.00%       2.605ms     868.221us             3  
-                                Activity Buffer Request        34.45%       1.457ms        34.45%       1.457ms       1.457ms     884.680us        33.97%     884.680us     884.680us             1  
-                                            aten::empty         1.07%      45.402us         1.07%      45.402us       7.567us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.29%      12.202us         0.29%      12.202us       4.067us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.99%      41.761us         0.99%      41.761us      13.920us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        54.34%       2.298ms        54.34%       2.298ms       2.298ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         3.85%     171.193us        46.01%       2.045ms       2.045ms       0.000us         0.00%       3.614ms       3.614ms             1  
+                                          FlashAttnFunc         3.07%     136.295us        42.15%       1.874ms     624.570us       0.000us         0.00%       3.614ms       1.205ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.94%      86.341us        39.09%       1.737ms     579.138us       2.720ms       100.00%       3.614ms       1.205ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.722ms       100.05%       2.722ms       2.722ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.720ms       100.00%       2.720ms     906.698us             3  
+                                Activity Buffer Request        34.72%       1.543ms        34.72%       1.543ms       1.543ms     893.600us        32.85%     893.600us     893.600us             1  
+                                            aten::empty         1.07%      47.441us         1.07%      47.441us       7.907us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.31%      13.761us         0.31%      13.761us       4.587us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.05%      46.772us         1.05%      46.772us      15.591us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.99%       2.400ms        53.99%       2.400ms       2.400ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.229ms
-Self CUDA time total: 2.605ms
+Self CPU time total: 4.445ms
+Self CUDA time total: 2.720ms
 
 
 
@@ -4182,19 +4182,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.90%     125.133us        41.34%       1.782ms       1.782ms       0.000us         0.00%       3.684ms       3.684ms             1  
-                                          FlashAttnFunc         2.10%      90.312us        38.43%       1.657ms     552.206us       0.000us         0.00%       3.684ms       1.228ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.24%      53.461us        36.34%       1.566ms     522.102us       2.755ms       100.00%       3.684ms       1.228ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.756ms       100.06%       2.756ms       2.756ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.755ms       100.00%       2.755ms     918.309us             3  
-                                Activity Buffer Request        33.60%       1.448ms        33.60%       1.448ms       1.448ms     929.157us        33.73%     929.157us     929.157us             1  
-                                            aten::empty         0.64%      27.380us         0.64%      27.380us       4.563us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.13%       5.449us         0.13%       5.449us       1.816us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.74%      31.802us         0.74%      31.802us      10.601us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        58.66%       2.529ms        58.66%       2.529ms       2.529ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.41%     104.370us        41.13%       1.784ms       1.784ms       0.000us         0.00%       3.700ms       3.700ms             1  
+                                          FlashAttnFunc         2.00%      86.685us        38.73%       1.679ms     559.738us       0.000us         0.00%       3.700ms       1.233ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.21%      52.631us        36.73%       1.593ms     530.843us       2.768ms       100.00%       3.700ms       1.233ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.769ms       100.06%       2.769ms       2.769ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.768ms       100.00%       2.768ms     922.559us             3  
+                                Activity Buffer Request        34.10%       1.479ms        34.10%       1.479ms       1.479ms     932.127us        33.68%     932.127us     932.127us             1  
+                                            aten::empty         0.60%      25.981us         0.60%      25.981us       4.330us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.050us         0.12%       5.050us       1.683us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.70%      30.140us         0.70%      30.140us      10.047us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.87%       2.553ms        58.87%       2.553ms       2.553ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.310ms
-Self CUDA time total: 2.755ms
+Self CPU time total: 4.336ms
+Self CUDA time total: 2.768ms
 
 
 
@@ -4204,19 +4204,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.81%     125.615us        39.44%       1.762ms       1.762ms       0.000us         0.00%       3.917ms       3.917ms             1  
-                                          FlashAttnFunc         2.03%      90.880us        36.63%       1.637ms     545.546us       0.000us         0.00%       3.917ms       1.306ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.20%      53.572us        34.59%       1.546ms     515.252us       2.927ms       100.00%       3.917ms       1.306ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.928ms       100.05%       2.928ms       2.928ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.927ms       100.00%       2.927ms     975.593us             3  
-                                Activity Buffer Request        31.96%       1.428ms        31.96%       1.428ms       1.428ms     990.441us        33.84%     990.441us     990.441us             1  
-                                            aten::empty         0.63%      27.950us         0.63%      27.950us       4.658us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       5.340us         0.12%       5.340us       1.780us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.68%      30.562us         0.68%      30.562us      10.187us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        60.56%       2.706ms        60.56%       2.706ms       2.706ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.29%     102.411us        40.10%       1.791ms       1.791ms       0.000us         0.00%       3.875ms       3.875ms             1  
+                                          FlashAttnFunc         2.01%      89.903us        37.81%       1.688ms     562.801us       0.000us         0.00%       3.875ms       1.292ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.18%      52.613us        35.79%       1.599ms     532.834us       2.892ms       100.00%       3.875ms       1.292ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.893ms       100.05%       2.893ms       2.893ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.892ms       100.00%       2.892ms     963.972us             3  
+                                Activity Buffer Request        33.24%       1.485ms        33.24%       1.485ms       1.485ms     983.097us        33.99%     983.097us     983.097us             1  
+                                            aten::empty         0.58%      25.770us         0.58%      25.770us       4.295us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       4.820us         0.11%       4.820us       1.607us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.69%      30.740us         0.69%      30.740us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.90%       2.675ms        59.90%       2.675ms       2.675ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.469ms
-Self CUDA time total: 2.927ms
+Self CPU time total: 4.466ms
+Self CUDA time total: 2.892ms
 
 
 
@@ -4226,19 +4226,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.73%     126.513us        42.04%       1.948ms       1.948ms       0.000us         0.00%       3.892ms       3.892ms             1  
-                                          FlashAttnFunc         2.03%      94.184us        39.31%       1.821ms     607.134us       0.000us         0.00%       3.892ms       1.297ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.14%      52.959us        37.28%       1.727ms     575.740us       2.906ms       100.00%       3.892ms       1.297ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.908ms       100.05%       2.908ms       2.908ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.906ms       100.00%       2.906ms     968.728us             3  
-                                Activity Buffer Request        30.69%       1.422ms        30.69%       1.422ms       1.422ms     985.540us        33.91%     985.540us     985.540us             1  
-                                            aten::empty         0.63%      29.361us         0.63%      29.361us       4.893us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.11%       5.241us         0.11%       5.241us       1.747us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         4.70%     217.965us         4.70%     217.965us      72.655us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        57.96%       2.685ms        57.96%       2.685ms       2.685ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.68%     125.944us        42.11%       1.982ms       1.982ms       0.000us         0.00%       3.932ms       3.932ms             1  
+                                          FlashAttnFunc         1.98%      92.983us        39.44%       1.856ms     618.639us       0.000us         0.00%       3.932ms       1.311ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.14%      53.661us        37.46%       1.763ms     587.645us       2.953ms       100.00%       3.932ms       1.311ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.954ms       100.06%       2.954ms       2.954ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.953ms       100.00%       2.953ms     984.176us             3  
+                                Activity Buffer Request        30.48%       1.434ms        30.48%       1.434ms       1.434ms     979.803us        33.19%     979.803us     979.803us             1  
+                                            aten::empty         0.58%      27.450us         0.58%      27.450us       4.575us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.150us         0.11%       5.150us       1.717us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.15%     242.396us         5.15%     242.396us      80.799us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        57.89%       2.724ms        57.89%       2.724ms       2.724ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.633ms
-Self CUDA time total: 2.906ms
+Self CPU time total: 4.706ms
+Self CUDA time total: 2.953ms
 
 
 
@@ -4248,19 +4248,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.33%     120.764us        37.09%       1.922ms       1.922ms       0.000us         0.00%       4.645ms       4.645ms             1  
-                                          FlashAttnFunc         1.78%      92.240us        34.76%       1.801ms     600.384us       0.000us         0.00%       4.645ms       1.548ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.04%      53.829us        32.98%       1.709ms     569.637us       3.482ms       100.00%       4.645ms       1.548ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.483ms       100.04%       3.483ms       3.483ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.482ms       100.00%       3.482ms       1.161ms             3  
-                                Activity Buffer Request        27.80%       1.441ms        27.80%       1.441ms       1.441ms       1.163ms        33.40%       1.163ms       1.163ms             1  
-                                            aten::empty         0.54%      28.012us         0.54%      28.012us       4.669us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.10%       5.211us         0.10%       5.211us       1.737us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.50%     181.305us         3.50%     181.305us      60.435us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.91%       3.260ms        62.91%       3.260ms       3.260ms       0.000us         0.00%       0.000us       0.000us             1  
+                                 hf_kernels_flash_attn3         2.36%     122.892us        37.59%       1.960ms       1.960ms       0.000us         0.00%       4.622ms       4.622ms             1  
+                                          FlashAttnFunc         1.74%      90.533us        35.23%       1.837ms     612.429us       0.000us         0.00%       4.622ms       1.541ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         0.97%      50.750us        33.49%       1.747ms     582.252us       3.470ms       100.00%       4.622ms       1.541ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.472ms       100.05%       3.472ms       3.472ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.470ms       100.00%       3.470ms       1.157ms             3  
+                                Activity Buffer Request        27.49%       1.433ms        27.49%       1.433ms       1.433ms       1.152ms        33.20%       1.152ms       1.152ms             1  
+                                            aten::empty         0.51%      26.592us         0.51%      26.592us       4.432us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.10%       5.060us         0.10%       5.060us       1.687us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.43%     230.856us         4.43%     230.856us      76.952us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        62.41%       3.255ms        62.41%       3.255ms       3.255ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.182ms
-Self CUDA time total: 3.482ms
+Self CPU time total: 5.215ms
+Self CUDA time total: 3.470ms
 
 
 
@@ -4270,33 +4270,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                 hf_kernels_flash_attn3         2.54%     130.883us        37.28%       1.924ms       1.924ms       0.000us         0.00%       4.633ms       4.633ms             1  
-                                          FlashAttnFunc         1.80%      93.033us        34.74%       1.793ms     597.564us       0.000us         0.00%       4.633ms       1.544ms             3  
-                        _flash_attn3_48fe103_dirty::fwd         1.02%      52.583us        32.94%       1.700ms     566.553us       3.468ms       100.00%       4.633ms       1.544ms             3  
-                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.469ms       100.04%       3.469ms       3.469ms             1  
+                                 hf_kernels_flash_attn3         2.32%     120.892us        37.51%       1.951ms       1.951ms       0.000us         0.00%       4.639ms       4.639ms             1  
+                                          FlashAttnFunc         1.74%      90.773us        35.18%       1.830ms     610.133us       0.000us         0.00%       4.639ms       1.546ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         0.99%      51.351us        33.44%       1.740ms     579.875us       3.468ms       100.00%       4.639ms       1.546ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.469ms       100.05%       3.469ms       3.469ms             1  
 void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.468ms       100.00%       3.468ms       1.156ms             3  
-                                Activity Buffer Request        27.99%       1.444ms        27.99%       1.444ms       1.444ms       1.165ms        33.61%       1.165ms       1.165ms             1  
-                                            aten::empty         0.56%      29.150us         0.56%      29.150us       4.858us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.10%       5.050us         0.10%       5.050us       1.683us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.27%     168.763us         3.27%     168.763us      56.254us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        62.72%       3.236ms        62.72%       3.236ms       3.236ms       0.000us         0.00%       0.000us       0.000us             1  
+                                Activity Buffer Request        27.26%       1.418ms        27.26%       1.418ms       1.418ms       1.172ms        33.79%       1.172ms       1.172ms             1  
+                                            aten::empty         0.51%      26.560us         0.51%      26.560us       4.427us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.10%       5.101us         0.10%       5.101us       1.700us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.58%     238.367us         4.58%     238.367us      79.456us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        62.49%       3.251ms        62.49%       3.251ms       3.251ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.160ms
+Self CPU time total: 5.202ms
 Self CUDA time total: 3.468ms
 
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.91  True
-hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.95  True
-hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.02  True
-hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.02  True
-hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.18  True
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.92  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.96  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.01  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.03  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.20  True
 hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.18  True
 </pre></div>
 <div class="cell-stderr">
 Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
-Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.35it/s]
-Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.71it/s]
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.42it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.84it/s]
 </div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html
index a146d1ecfde534d0841c299486870e29ea70f3bb..e6d938b9f4ce572baa96778a2f0d11d329ead530 100644
--- a/flash_attn/impls/mem_efficient_attention.html
+++ b/flash_attn/impls/mem_efficient_attention.html
@@ -4110,7 +4110,7 @@ Cell: benchmark | 3.94s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="31">
 <div class="code-wrap">
@@ -4159,28 +4159,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         5.20%     361.468us        33.36%       2.319ms       2.319ms       0.000us         0.00%       5.387ms       5.387ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.370ms       100.63%       5.370ms       5.370ms             1  
-                     aten::scaled_dot_product_attention         0.48%      33.240us         2.68%     186.333us      62.111us       0.000us         0.00%       4.719ms       1.573ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.35%      24.389us         2.20%     153.093us      51.031us       0.000us         0.00%       4.719ms       1.573ms             3  
-                     aten::_efficient_attention_forward         0.53%      37.120us         1.50%     104.111us      34.704us       4.719ms        88.44%       4.719ms       1.573ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       4.719ms        88.44%       4.719ms       1.573ms             3  
-                                       aten::contiguous         0.18%      12.841us        24.53%       1.706ms     189.522us       0.000us         0.00%     667.809us      74.201us             9  
-                                            aten::clone         0.46%      31.899us        24.35%       1.693ms     188.095us       0.000us         0.00%     667.809us      74.201us             9  
-                                            aten::copy_         1.13%      78.352us        22.86%       1.589ms     176.604us     617.121us        11.56%     667.809us      74.201us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     617.121us        11.56%     617.121us      68.569us             9  
-                                Activity Buffer Request        20.52%       1.427ms        20.52%       1.427ms       1.427ms      50.688us         0.95%      50.688us      50.688us             1  
-                                        aten::transpose         0.98%      68.237us         1.30%      90.074us       3.753us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.31%      21.837us         0.31%      21.837us       0.910us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.25%      17.541us         1.03%      71.521us       7.947us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         1.19%      82.429us         1.19%      82.429us       3.925us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.61%     111.770us         1.61%     111.770us       9.314us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.05%       3.512us         0.05%       3.512us       1.171us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.11%       7.660us         0.11%       7.660us       2.553us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        66.64%       4.633ms        66.64%       4.633ms       4.633ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         5.14%     365.276us        32.53%       2.313ms       2.313ms       0.000us         0.00%       5.511ms       5.511ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.492ms       100.58%       5.492ms       5.492ms             1  
+                     aten::scaled_dot_product_attention         0.43%      30.401us         2.47%     175.534us      58.511us       0.000us         0.00%       4.841ms       1.614ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.33%      23.489us         2.04%     145.133us      48.378us       0.000us         0.00%       4.841ms       1.614ms             3  
+                     aten::_efficient_attention_forward         0.51%      36.572us         1.40%      99.733us      33.244us       4.841ms        88.65%       4.841ms       1.614ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       4.841ms        88.65%       4.841ms       1.614ms             3  
+                                       aten::contiguous         0.18%      12.851us        23.99%       1.706ms     189.523us       0.000us         0.00%     670.241us      74.471us             9  
+                                            aten::clone         0.46%      32.742us        23.80%       1.693ms     188.095us       0.000us         0.00%     670.241us      74.471us             9  
+                                            aten::copy_         1.05%      74.801us        22.33%       1.588ms     176.415us     619.776us        11.35%     670.241us      74.471us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     619.776us        11.35%     619.776us      68.864us             9  
+                                Activity Buffer Request        20.17%       1.434ms        20.17%       1.434ms       1.434ms      50.465us         0.92%      50.465us      50.465us             1  
+                                        aten::transpose         0.93%      66.224us         1.25%      88.644us       3.693us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.32%      22.420us         0.32%      22.420us       0.934us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.25%      17.919us         1.02%      72.382us       8.042us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         1.14%      81.114us         1.14%      81.114us       3.863us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.46%     103.973us         1.46%     103.973us       8.664us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.04%       2.960us         0.04%       2.960us       0.987us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.12%       8.310us         0.12%       8.310us       2.770us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        67.47%       4.798ms        67.47%       4.798ms       4.798ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.952ms
-Self CUDA time total: 5.336ms
+Self CPU time total: 7.111ms
+Self CUDA time total: 5.460ms
 
 
 
@@ -4190,28 +4190,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         3.61%     259.378us        29.44%       2.116ms       2.116ms       0.000us         0.00%       5.734ms       5.734ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.688ms       100.15%       5.688ms       5.688ms             1  
-                     aten::scaled_dot_product_attention         0.27%      19.560us         2.06%     147.832us      49.277us       0.000us         0.00%       5.042ms       1.681ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.27%      19.340us         1.78%     128.272us      42.757us       0.000us         0.00%       5.042ms       1.681ms             3  
-                     aten::_efficient_attention_forward         0.39%      28.380us         1.18%      84.990us      28.330us       5.042ms        88.79%       5.042ms       1.681ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.042ms        88.79%       5.042ms       1.681ms             3  
-                                       aten::contiguous         0.11%       8.118us        23.11%       1.661ms     184.525us       0.000us         0.00%     691.453us      76.828us             9  
-                                            aten::clone         0.32%      22.761us        23.00%       1.653ms     183.623us       0.000us         0.00%     691.453us      76.828us             9  
-                                            aten::copy_         0.95%      68.519us        21.65%       1.556ms     172.887us     636.925us        11.21%     691.453us      76.828us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     636.925us        11.21%     636.925us      70.769us             9  
-                                Activity Buffer Request        19.69%       1.415ms        19.69%       1.415ms       1.415ms      54.528us         0.96%      54.528us      54.528us             1  
-                                        aten::transpose         0.75%      54.034us         1.00%      71.792us       2.991us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.25%      17.758us         0.25%      17.758us       0.740us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.18%      12.992us         1.03%      73.863us       8.207us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         1.22%      87.512us         1.22%      87.512us       4.167us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.35%      96.951us         1.35%      96.951us       8.079us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.489us         0.03%       2.489us       0.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.04%       3.130us         0.04%       3.130us       1.043us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        70.56%       5.071ms        70.56%       5.071ms       5.071ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.28%     242.746us        28.00%       2.075ms       2.075ms       0.000us         0.00%       5.933ms       5.933ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.886ms       100.14%       5.886ms       5.886ms             1  
+                     aten::scaled_dot_product_attention         0.25%      18.240us         1.89%     140.073us      46.691us       0.000us         0.00%       5.241ms       1.747ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.25%      18.689us         1.64%     121.833us      40.611us       0.000us         0.00%       5.241ms       1.747ms             3  
+                     aten::_efficient_attention_forward         0.38%      28.462us         1.09%      81.063us      27.021us       5.241ms        89.17%       5.241ms       1.747ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.241ms        89.17%       5.241ms       1.747ms             3  
+                                       aten::contiguous         0.10%       7.041us        22.26%       1.650ms     183.285us       0.000us         0.00%     691.103us      76.789us             9  
+                                            aten::clone         0.29%      21.342us        22.17%       1.643ms     182.503us       0.000us         0.00%     691.103us      76.789us             9  
+                                            aten::copy_         0.86%      63.451us        21.24%       1.574ms     174.872us     636.671us        10.83%     691.103us      76.789us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     636.671us        10.83%     636.671us      70.741us             9  
+                                Activity Buffer Request        19.50%       1.445ms        19.50%       1.445ms       1.445ms      54.432us         0.93%      54.432us      54.432us             1  
+                                        aten::transpose         0.64%      47.650us         0.87%      64.701us       2.696us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.23%      17.051us         0.23%      17.051us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.16%      11.589us         0.64%      47.330us       5.259us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.82%      60.521us         0.82%      60.521us       2.882us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.19%      88.044us         1.19%      88.044us       7.337us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.420us         0.03%       2.420us       0.807us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.030us         0.04%       3.030us       1.010us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        72.00%       5.335ms        72.00%       5.335ms       5.335ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.187ms
-Self CUDA time total: 5.679ms
+Self CPU time total: 7.410ms
+Self CUDA time total: 5.878ms
 
 
 
@@ -4221,28 +4221,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         3.31%     247.873us        28.16%       2.111ms       2.111ms       0.000us         0.00%       6.014ms       6.014ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.964ms       100.13%       5.964ms       5.964ms             1  
-                     aten::scaled_dot_product_attention         0.26%      19.681us         1.94%     145.404us      48.468us       0.000us         0.00%       5.300ms       1.767ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.25%      18.780us         1.68%     125.723us      41.908us       0.000us         0.00%       5.300ms       1.767ms             3  
-                     aten::_efficient_attention_forward         0.40%      29.910us         1.12%      83.752us      27.917us       5.300ms        89.00%       5.300ms       1.767ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.300ms        89.00%       5.300ms       1.767ms             3  
-                                       aten::contiguous         0.10%       7.548us        22.32%       1.673ms     185.921us       0.000us         0.00%     713.444us      79.272us             9  
-                                            aten::clone         0.29%      21.851us        22.22%       1.666ms     185.082us       0.000us         0.00%     713.444us      79.272us             9  
-                                            aten::copy_         0.89%      66.441us        21.22%       1.591ms     176.813us     655.331us        11.00%     713.444us      79.272us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     655.331us        11.00%     655.331us      72.815us             9  
-                                Activity Buffer Request        19.37%       1.452ms        19.37%       1.452ms       1.452ms      58.113us         0.98%      58.113us      58.113us             1  
-                                        aten::transpose         0.68%      50.773us         0.90%      67.843us       2.827us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.23%      17.070us         0.23%      17.070us       0.711us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.16%      12.290us         0.70%      52.570us       5.841us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.87%      64.980us         0.87%      64.980us       3.094us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         1.28%      96.085us         1.28%      96.085us       8.007us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.520us         0.03%       2.520us       0.840us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.04%       3.050us         0.04%       3.050us       1.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        71.84%       5.386ms        71.84%       5.386ms       5.386ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.21%     244.055us        27.47%       2.092ms       2.092ms       0.000us         0.00%       6.130ms       6.130ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.080ms       100.14%       6.080ms       6.080ms             1  
+                     aten::scaled_dot_product_attention         0.23%      17.641us         1.86%     141.944us      47.315us       0.000us         0.00%       5.414ms       1.805ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.25%      19.359us         1.63%     124.303us      41.434us       0.000us         0.00%       5.414ms       1.805ms             3  
+                     aten::_efficient_attention_forward         0.37%      28.219us         1.06%      80.592us      26.864us       5.414ms        89.17%       5.414ms       1.805ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.414ms        89.17%       5.414ms       1.805ms             3  
+                                       aten::contiguous         0.11%       8.060us        21.81%       1.661ms     184.510us       0.000us         0.00%     716.192us      79.577us             9  
+                                            aten::clone         0.29%      22.431us        21.70%       1.653ms     183.615us       0.000us         0.00%     716.192us      79.577us             9  
+                                            aten::copy_         0.81%      61.641us        20.75%       1.580ms     175.564us     657.728us        10.83%     716.192us      79.577us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     657.728us        10.83%     657.728us      73.081us             9  
+                                Activity Buffer Request        19.08%       1.453ms        19.08%       1.453ms       1.453ms      58.464us         0.96%      58.464us      58.464us             1  
+                                        aten::transpose         0.69%      52.203us         0.92%      69.763us       2.907us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.23%      17.560us         0.23%      17.560us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.15%      11.581us         0.66%      50.023us       5.558us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.84%      63.785us         0.84%      63.785us       3.037us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.14%      86.832us         1.14%      86.832us       7.236us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.250us         0.03%       2.250us       0.750us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.260us         0.04%       3.260us       1.087us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        72.53%       5.522ms        72.53%       5.522ms       5.522ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.498ms
-Self CUDA time total: 5.956ms
+Self CPU time total: 7.614ms
+Self CUDA time total: 6.072ms
 
 
 
@@ -4252,28 +4252,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         3.20%     247.803us        30.17%       2.338ms       2.338ms       0.000us         0.00%       6.050ms       6.050ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.000ms       100.13%       6.000ms       6.000ms             1  
-                     aten::scaled_dot_product_attention         0.37%      28.670us         2.04%     158.093us      52.698us       0.000us         0.00%       5.339ms       1.780ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.26%      20.220us         1.67%     129.423us      43.141us       0.000us         0.00%       5.339ms       1.780ms             3  
-                     aten::_efficient_attention_forward         0.38%      29.560us         1.08%      83.863us      27.954us       5.339ms        89.10%       5.339ms       1.780ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.339ms        89.10%       5.339ms       1.780ms             3  
-                                       aten::contiguous         0.10%       7.610us        24.36%       1.887ms     209.722us       0.000us         0.00%     711.328us      79.036us             9  
-                                            aten::clone         0.28%      21.914us        24.26%       1.880ms     208.876us       0.000us         0.00%     711.328us      79.036us             9  
-                                            aten::copy_         0.87%      67.261us        23.30%       1.806ms     200.640us     653.248us        10.90%     711.328us      79.036us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     653.248us        10.90%     653.248us      72.583us             9  
-                                Activity Buffer Request        18.39%       1.425ms        18.39%       1.425ms       1.425ms      58.080us         0.97%      58.080us      58.080us             1  
-                                        aten::transpose         0.68%      52.310us         0.90%      69.650us       2.902us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.22%      17.340us         0.22%      17.340us       0.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.16%      12.088us         0.67%      52.209us       5.801us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.84%      64.993us         0.84%      64.993us       3.095us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         4.36%     337.546us         4.36%     337.546us      28.129us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.491us         0.03%       2.491us       0.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.04%       3.020us         0.04%       3.020us       1.007us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        69.83%       5.411ms        69.83%       5.411ms       5.411ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.16%     248.365us        29.29%       2.300ms       2.300ms       0.000us         0.00%       6.163ms       6.163ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.114ms       100.14%       6.114ms       6.114ms             1  
+                     aten::scaled_dot_product_attention         0.24%      19.232us         1.82%     142.774us      47.591us       0.000us         0.00%       5.452ms       1.817ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.25%      19.461us         1.57%     123.542us      41.181us       0.000us         0.00%       5.452ms       1.817ms             3  
+                     aten::_efficient_attention_forward         0.37%      29.029us         1.03%      80.672us      26.891us       5.452ms        89.29%       5.452ms       1.817ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.452ms        89.29%       5.452ms       1.817ms             3  
+                                       aten::contiguous         0.10%       7.931us        23.78%       1.867ms     207.435us       0.000us         0.00%     711.072us      79.008us             9  
+                                            aten::clone         0.30%      23.532us        23.68%       1.859ms     206.554us       0.000us         0.00%     711.072us      79.008us             9  
+                                            aten::copy_         0.81%      63.779us        22.73%       1.785ms     198.306us     653.792us        10.71%     711.072us      79.008us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     653.792us        10.71%     653.792us      72.644us             9  
+                                Activity Buffer Request        18.59%       1.459ms        18.59%       1.459ms       1.459ms      57.280us         0.94%      57.280us      57.280us             1  
+                                        aten::transpose         0.62%      48.610us         0.83%      65.130us       2.714us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.21%      16.520us         0.21%      16.520us       0.688us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.16%      12.281us         0.65%      50.702us       5.634us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.80%      62.502us         0.80%      62.502us       2.976us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         3.60%     282.729us         3.60%     282.729us      23.561us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.471us         0.03%       2.471us       0.824us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.05%       4.120us         0.05%       4.120us       1.373us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        70.71%       5.551ms        70.71%       5.551ms       5.551ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.749ms
-Self CUDA time total: 5.992ms
+Self CPU time total: 7.851ms
+Self CUDA time total: 6.106ms
 
 
 
@@ -4283,28 +4283,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         3.22%     253.272us        29.03%       2.283ms       2.283ms       0.000us         0.00%       6.248ms       6.248ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.196ms       100.13%       6.196ms       6.196ms             1  
-                     aten::scaled_dot_product_attention         0.25%      19.441us         2.25%     176.884us      58.961us       0.000us         0.00%       5.524ms       1.841ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.26%      20.811us         2.00%     157.443us      52.481us       0.000us         0.00%       5.524ms       1.841ms             3  
-                     aten::_efficient_attention_forward         0.41%      31.883us         1.42%     111.902us      37.301us       5.524ms        89.27%       5.524ms       1.841ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.524ms        89.27%       5.524ms       1.841ms             3  
-                                       aten::contiguous         0.10%       7.580us        22.97%       1.807ms     200.732us       0.000us         0.00%     724.035us      80.448us             9  
-                                            aten::clone         0.28%      22.150us        22.88%       1.799ms     199.890us       0.000us         0.00%     724.035us      80.448us             9  
-                                            aten::copy_         0.85%      67.019us        21.94%       1.725ms     191.709us     664.226us        10.73%     724.035us      80.448us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     664.226us        10.73%     664.226us      73.803us             9  
-                                Activity Buffer Request        18.12%       1.425ms        18.12%       1.425ms       1.425ms      59.809us         0.97%      59.809us      59.809us             1  
-                                        aten::transpose         0.68%      53.201us         0.91%      71.182us       2.966us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.23%      17.981us         0.23%      17.981us       0.749us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.15%      12.001us         0.65%      51.482us       5.720us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.81%      63.729us         0.81%      63.729us       3.035us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         3.60%     283.426us         3.60%     283.426us      23.619us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.490us         0.03%       2.490us       0.830us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.04%       2.980us         0.04%       2.980us       0.993us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        70.97%       5.581ms        70.97%       5.581ms       5.581ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         3.01%     243.675us        28.03%       2.272ms       2.272ms       0.000us         0.00%       6.451ms       6.451ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.399ms       100.13%       6.399ms       6.399ms             1  
+                     aten::scaled_dot_product_attention         0.23%      18.671us         1.77%     143.224us      47.741us       0.000us         0.00%       5.726ms       1.909ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.24%      19.652us         1.54%     124.553us      41.518us       0.000us         0.00%       5.726ms       1.909ms             3  
+                     aten::_efficient_attention_forward         0.35%      28.317us         0.99%      80.642us      26.881us       5.726ms        89.60%       5.726ms       1.909ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.726ms        89.60%       5.726ms       1.909ms             3  
+                                       aten::contiguous         0.10%       7.791us        22.70%       1.840ms     204.460us       0.000us         0.00%     725.025us      80.558us             9  
+                                            aten::clone         0.29%      23.489us        22.61%       1.832ms     203.594us       0.000us         0.00%     725.025us      80.558us             9  
+                                            aten::copy_         0.81%      65.293us        21.68%       1.757ms     195.223us     664.641us        10.40%     725.025us      80.558us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     664.641us        10.40%     664.641us      73.849us             9  
+                                Activity Buffer Request        17.77%       1.440ms        17.77%       1.440ms       1.440ms      60.384us         0.94%      60.384us      60.384us             1  
+                                        aten::transpose         0.63%      51.151us         0.85%      69.251us       2.885us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.22%      18.100us         0.22%      18.100us       0.754us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.15%      11.960us         0.64%      51.852us       5.761us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.79%      64.314us         0.79%      64.314us       3.063us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         3.36%     272.117us         3.36%     272.117us      22.676us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.500us         0.03%       2.500us       0.833us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.06%       4.532us         0.06%       4.532us       1.511us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.97%       5.833ms        71.97%       5.833ms       5.833ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 7.864ms
-Self CUDA time total: 6.188ms
+Self CPU time total: 8.105ms
+Self CUDA time total: 6.391ms
 
 
 
@@ -4314,37 +4314,37 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          torch_mem_eff         3.10%     256.636us        27.41%       2.272ms       2.272ms       0.000us         0.00%       6.685ms       6.685ms             1  
-                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.632ms       100.12%       6.632ms       6.632ms             1  
-                     aten::scaled_dot_product_attention         0.23%      18.791us         1.80%     149.483us      49.828us       0.000us         0.00%       5.954ms       1.985ms             3  
-          aten::_scaled_dot_product_efficient_attention         0.24%      19.642us         1.58%     130.692us      43.564us       0.000us         0.00%       5.954ms       1.985ms             3  
-                     aten::_efficient_attention_forward         0.40%      33.027us         1.05%      86.901us      28.967us       5.954ms        89.88%       5.954ms       1.985ms             3  
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.954ms        89.88%       5.954ms       1.985ms             3  
-                                       aten::contiguous         0.09%       7.531us        21.68%       1.797ms     199.660us       0.000us         0.00%     731.136us      81.237us             9  
-                                            aten::clone         0.27%      22.649us        21.59%       1.789ms     198.823us       0.000us         0.00%     731.136us      81.237us             9  
-                                            aten::copy_         0.82%      67.700us        20.66%       1.712ms     190.261us     670.176us        10.12%     731.136us      81.237us             9  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     670.176us        10.12%     670.176us      74.464us             9  
-                                Activity Buffer Request        17.30%       1.434ms        17.30%       1.434ms       1.434ms      60.960us         0.92%      60.960us      60.960us             1  
-                                        aten::transpose         0.90%      75.001us         1.12%      92.890us       3.870us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.22%      17.889us         0.22%      17.889us       0.745us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::empty_like         0.15%      12.259us         0.66%      54.410us       6.046us       0.000us         0.00%       0.000us       0.000us             9  
-                                            aten::empty         0.81%      67.133us         0.81%      67.133us       3.197us       0.000us         0.00%       0.000us       0.000us            21  
-                                       cudaLaunchKernel         2.82%     234.057us         2.82%     234.057us      19.505us       0.000us         0.00%       0.000us       0.000us            12  
-                                  cudaStreamIsCapturing         0.03%       2.420us         0.03%       2.420us       0.807us       0.000us         0.00%       0.000us       0.000us             3  
-                                   cudaFuncSetAttribute         0.04%       3.430us         0.04%       3.430us       1.143us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        72.59%       6.017ms        72.59%       6.017ms       6.017ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          torch_mem_eff         2.88%     242.135us        27.00%       2.269ms       2.269ms       0.000us         0.00%       6.759ms       6.759ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.705ms       100.12%       6.705ms       6.705ms             1  
+                     aten::scaled_dot_product_attention         0.21%      17.851us         1.72%     144.884us      48.295us       0.000us         0.00%       6.024ms       2.008ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.23%      19.591us         1.51%     127.033us      42.344us       0.000us         0.00%       6.024ms       2.008ms             3  
+                     aten::_efficient_attention_forward         0.34%      28.520us         0.97%      81.532us      27.177us       6.024ms        89.96%       6.024ms       2.008ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       6.024ms        89.96%       6.024ms       2.008ms             3  
+                                       aten::contiguous         0.10%       8.099us        21.87%       1.838ms     204.242us       0.000us         0.00%     734.178us      81.575us             9  
+                                            aten::clone         0.28%      23.122us        21.78%       1.830ms     203.342us       0.000us         0.00%     734.178us      81.575us             9  
+                                            aten::copy_         0.74%      62.180us        20.86%       1.753ms     194.799us     672.322us        10.04%     734.178us      81.575us             9  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     672.322us        10.04%     672.322us      74.702us             9  
+                                Activity Buffer Request        17.19%       1.445ms        17.19%       1.445ms       1.445ms      61.856us         0.92%      61.856us      61.856us             1  
+                                        aten::transpose         0.62%      52.351us         0.83%      70.022us       2.918us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.21%      17.671us         0.21%      17.671us       0.736us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.15%      12.653us         0.64%      53.763us       5.974us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.79%      66.761us         0.79%      66.761us       3.179us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         3.19%     267.907us         3.19%     267.907us      22.326us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.430us         0.03%       2.430us       0.810us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.350us         0.04%       3.350us       1.117us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        73.00%       6.134ms        73.00%       6.134ms       6.134ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 8.289ms
-Self CUDA time total: 6.624ms
+Self CPU time total: 8.404ms
+Self CUDA time total: 6.697ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_mem_eff            cuda_attn_L128_bfloat16     1.81  True
-torch_mem_eff            cuda_attn_L256_bfloat16     1.88  True
-torch_mem_eff            cuda_attn_L320_bfloat16     1.97  True
-torch_mem_eff            cuda_attn_L384_bfloat16     1.97  True
-torch_mem_eff            cuda_attn_L448_bfloat16     2.09  True
-torch_mem_eff            cuda_attn_L512_bfloat16     2.22  True
+torch_mem_eff            cuda_attn_L128_bfloat16     1.85  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.95  True
+torch_mem_eff            cuda_attn_L320_bfloat16     1.99  True
+torch_mem_eff            cuda_attn_L384_bfloat16     2.07  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.06  True
+torch_mem_eff            cuda_attn_L512_bfloat16     2.25  True
 </pre></div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html
index fce8d8891e35f4da7c7b93129ab9c68bf413d0a6..c964f0f922939bcdffdf70f7e986e24de2938dac 100644
--- a/flash_attn/impls/sage_attention.html
+++ b/flash_attn/impls/sage_attention.html
@@ -4104,13 +4104,14 @@ body[data-tool="eraser"] .main-content {
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 4.12s
+Cell: benchmark | 4.69s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/sage_attention" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="32">
 <div class="code-wrap">
@@ -4155,24 +4156,27 @@ Cell: benchmark | 4.12s
 <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
 impl                     wl                  p50(ms)  ok
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 </pre></div>
-<div class="cell-stderr">
-Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
-Fetching 11 files:  18%|█▊        | 2/11 [00:00&lt;00:00, 17.35it/s]
-Fetching 11 files:  73%|███████▎  | 8/11 [00:00&lt;00:00, 15.18it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 21.06it/s]
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 15 packages in 14ms
 </div>
+</div>
+<div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
+Fetching 11 files:  73%|███████▎  | 8/11 [00:00&lt;00:00, 11.73it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 16.12it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html
index e1ecdb582681c3ec96cb0b0c54cc3f176cd9f9eb..3e1c781413a91f403396426a1c99ea9ec7673187 100644
--- a/flash_attn/impls/xformers.html
+++ b/flash_attn/impls/xformers.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 5.04s
+Cell: benchmark | 33.71s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="30">
 <div class="code-wrap">
@@ -4158,21 +4158,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff        11.46%     506.438us        53.66%       2.372ms       2.372ms       0.000us         0.00%       3.500ms       3.500ms             1  
-                             xformers_flash3::flash_fwd         4.48%     198.083us        41.44%       1.831ms     610.487us       0.000us         0.00%       3.500ms       1.167ms             3  
-                                      flash_attn_3::fwd         1.73%      76.649us        36.96%       1.633ms     544.459us       2.610ms       100.00%       3.500ms       1.167ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.612ms       100.06%       2.612ms       2.612ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.610ms       100.00%       2.610ms     870.154us             3  
-                                Activity Buffer Request        33.26%       1.470ms        33.26%       1.470ms       1.470ms     889.248us        34.06%     889.248us     889.248us             1  
-                                            aten::empty         0.80%      35.182us         0.80%      35.182us       5.864us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.25%      10.920us         0.25%      10.920us       3.640us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.92%      40.501us         0.92%      40.501us      13.500us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.27%      12.132us         0.77%      33.872us       5.645us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.49%      21.740us         0.49%      21.740us       3.623us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        46.34%       2.048ms        46.34%       2.048ms       2.048ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff        10.98%     488.134us        52.82%       2.349ms       2.349ms       0.000us         0.00%       3.539ms       3.539ms             1  
+                             xformers_flash3::flash_fwd         4.45%     198.034us        41.02%       1.824ms     608.009us       0.000us         0.00%       3.539ms       1.180ms             3  
+                                      flash_attn_3::fwd         1.81%      80.354us        36.57%       1.626ms     541.997us       2.647ms       100.00%       3.539ms       1.180ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.648ms       100.06%       2.648ms       2.648ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.647ms       100.00%       2.647ms     882.203us             3  
+                                Activity Buffer Request        32.65%       1.452ms        32.65%       1.452ms       1.452ms     892.891us        33.74%     892.891us     892.891us             1  
+                                            aten::empty         0.78%      34.470us         0.78%      34.470us       5.745us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.26%      11.370us         0.26%      11.370us       3.790us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.08%      47.851us         1.08%      47.851us      15.950us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.28%      12.261us         0.82%      36.420us       6.070us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.54%      24.159us         0.54%      24.159us       4.026us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        47.18%       2.098ms        47.18%       2.098ms       2.098ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.420ms
-Self CUDA time total: 2.610ms
+Self CPU time total: 4.447ms
+Self CUDA time total: 2.647ms
 
 
 
@@ -4182,21 +4182,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         7.25%     318.297us        46.47%       2.042ms       2.042ms       0.000us         0.00%       3.722ms       3.722ms             1  
-                             xformers_flash3::flash_fwd         3.37%     148.131us        38.68%       1.699ms     566.453us       0.000us         0.00%       3.722ms       1.241ms             3  
-                                      flash_attn_3::fwd         1.17%      51.450us        35.31%       1.551ms     517.076us       2.780ms       100.00%       3.722ms       1.241ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.782ms       100.05%       2.782ms       2.782ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.780ms       100.00%       2.780ms     926.692us             3  
-                                Activity Buffer Request        32.58%       1.431ms        32.58%       1.431ms       1.431ms     942.244us        33.89%     942.244us     942.244us             1  
-                                            aten::empty         0.66%      29.210us         0.66%      29.210us       4.868us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.13%       5.512us         0.13%       5.512us       1.837us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.77%      34.031us         0.77%      34.031us      11.344us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.21%       9.369us         0.54%      23.900us       3.983us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.33%      14.531us         0.33%      14.531us       2.422us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.53%       2.351ms        53.53%       2.351ms       2.351ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         7.22%     318.208us        46.97%       2.070ms       2.070ms       0.000us         0.00%       3.700ms       3.700ms             1  
+                             xformers_flash3::flash_fwd         3.33%     146.973us        39.20%       1.728ms     575.898us       0.000us         0.00%       3.700ms       1.233ms             3  
+                                      flash_attn_3::fwd         1.20%      53.004us        35.87%       1.581ms     526.907us       2.767ms       100.00%       3.700ms       1.233ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.769ms       100.05%       2.769ms       2.769ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.767ms       100.00%       2.767ms     922.499us             3  
+                                Activity Buffer Request        33.12%       1.459ms        33.12%       1.459ms       1.459ms     932.857us        33.71%     932.857us     932.857us             1  
+                                            aten::empty         0.65%      28.790us         0.65%      28.790us       4.798us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.13%       5.860us         0.13%       5.860us       1.953us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.76%      33.580us         0.76%      33.580us      11.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.21%       9.291us         0.54%      23.901us       3.983us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.33%      14.610us         0.33%      14.610us       2.435us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        53.03%       2.337ms        53.03%       2.337ms       2.337ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.393ms
-Self CUDA time total: 2.780ms
+Self CPU time total: 4.407ms
+Self CUDA time total: 2.767ms
 
 
 
@@ -4206,21 +4206,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         6.91%     309.504us        45.24%       2.025ms       2.025ms       0.000us         0.00%       3.854ms       3.854ms             1  
-                             xformers_flash3::flash_fwd         3.30%     147.756us        37.80%       1.692ms     563.990us       0.000us         0.00%       3.854ms       1.285ms             3  
-                                      flash_attn_3::fwd         1.19%      53.048us        34.50%       1.544ms     514.738us       2.875ms       100.00%       3.854ms       1.285ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.877ms       100.05%       2.877ms       2.877ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.875ms       100.00%       2.875ms     958.381us             3  
-                                Activity Buffer Request        31.77%       1.422ms        31.77%       1.422ms       1.422ms     979.266us        34.06%     979.266us     979.266us             1  
-                                            aten::empty         0.67%      29.790us         0.67%      29.790us       4.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       5.570us         0.12%       5.570us       1.857us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.76%      33.852us         0.76%      33.852us      11.284us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.22%       9.920us         0.53%      23.660us       3.943us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.31%      13.740us         0.31%      13.740us       2.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        54.76%       2.451ms        54.76%       2.451ms       2.451ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         6.87%     306.279us        45.67%       2.036ms       2.036ms       0.000us         0.00%       3.803ms       3.803ms             1  
+                             xformers_flash3::flash_fwd         3.28%     146.193us        38.29%       1.707ms     568.871us       0.000us         0.00%       3.803ms       1.268ms             3  
+                                      flash_attn_3::fwd         1.22%      54.360us        35.01%       1.560ms     520.140us       2.841ms       100.00%       3.803ms       1.268ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.843ms       100.05%       2.843ms       2.843ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.841ms       100.00%       2.841ms     947.064us             3  
+                                Activity Buffer Request        32.21%       1.435ms        32.21%       1.435ms       1.435ms     961.848us        33.85%     961.848us     961.848us             1  
+                                            aten::empty         0.68%      30.200us         0.68%      30.200us       5.033us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.560us         0.12%       5.560us       1.853us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.78%      34.863us         0.78%      34.863us      11.621us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.20%       8.808us         0.51%      22.610us       3.768us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.31%      13.802us         0.31%      13.802us       2.300us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        54.33%       2.422ms        54.33%       2.422ms       2.422ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.476ms
-Self CUDA time total: 2.875ms
+Self CPU time total: 4.457ms
+Self CUDA time total: 2.841ms
 
 
 
@@ -4230,21 +4230,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         6.53%     306.895us        47.96%       2.255ms       2.255ms       0.000us         0.00%       3.838ms       3.838ms             1  
-                             xformers_flash3::flash_fwd         3.09%     145.243us        40.94%       1.925ms     641.651us       0.000us         0.00%       3.838ms       1.279ms             3  
-                                      flash_attn_3::fwd         1.17%      55.062us        37.85%       1.780ms     593.237us       2.865ms       100.00%       3.838ms       1.279ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.866ms       100.05%       2.866ms       2.866ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.865ms       100.00%       2.865ms     954.931us             3  
-                                Activity Buffer Request        30.23%       1.421ms        30.23%       1.421ms       1.421ms     973.182us        33.97%     973.182us     973.182us             1  
-                                            aten::empty         0.63%      29.790us         0.63%      29.790us       4.965us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.11%       5.390us         0.11%       5.390us       1.797us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         5.70%     268.094us         5.70%     268.094us      89.365us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.19%       8.710us         0.49%      22.930us       3.822us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.30%      14.220us         0.30%      14.220us       2.370us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        52.04%       2.447ms        52.04%       2.447ms       2.447ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         6.67%     311.798us        48.16%       2.253ms       2.253ms       0.000us         0.00%       3.854ms       3.854ms             1  
+                             xformers_flash3::flash_fwd         3.68%     172.144us        40.98%       1.917ms     638.949us       0.000us         0.00%       3.854ms       1.285ms             3  
+                                      flash_attn_3::fwd         1.19%      55.670us        37.30%       1.745ms     581.568us       2.881ms       100.00%       3.854ms       1.285ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.883ms       100.05%       2.883ms       2.883ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.881ms       100.00%       2.881ms     960.465us             3  
+                                Activity Buffer Request        30.77%       1.440ms        30.77%       1.440ms       1.440ms     972.603us        33.75%     972.603us     972.603us             1  
+                                            aten::empty         0.63%      29.580us         0.63%      29.580us       4.930us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.801us         0.12%       5.801us       1.934us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.58%     214.036us         4.58%     214.036us      71.345us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.19%       9.019us         0.51%      24.051us       4.009us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.32%      15.032us         0.32%      15.032us       2.505us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        51.84%       2.425ms        51.84%       2.425ms       2.425ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.702ms
-Self CUDA time total: 2.865ms
+Self CPU time total: 4.678ms
+Self CUDA time total: 2.881ms
 
 
 
@@ -4254,21 +4254,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         6.46%     328.735us        43.31%       2.206ms       2.206ms       0.000us         0.00%       4.477ms       4.477ms             1  
-                             xformers_flash3::flash_fwd         3.06%     155.642us        36.36%       1.852ms     617.231us       0.000us         0.00%       4.477ms       1.492ms             3  
-                                      flash_attn_3::fwd         1.12%      56.881us        33.30%       1.696ms     565.350us       3.348ms       100.00%       4.477ms       1.492ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.349ms       100.04%       3.349ms       3.349ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.348ms       100.00%       3.348ms       1.116ms             3  
-                                Activity Buffer Request        27.91%       1.421ms        27.91%       1.421ms       1.421ms       1.129ms        33.72%       1.129ms       1.129ms             1  
-                                            aten::empty         0.63%      32.251us         0.63%      32.251us       5.375us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.11%       5.740us         0.11%       5.740us       1.913us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.53%     179.913us         3.53%     179.913us      59.971us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.21%      10.692us         0.50%      25.231us       4.205us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.29%      14.539us         0.29%      14.539us       2.423us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        56.69%       2.887ms        56.69%       2.887ms       2.887ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         5.88%     304.576us        42.22%       2.188ms       2.188ms       0.000us         0.00%       4.552ms       4.552ms             1  
+                             xformers_flash3::flash_fwd         2.84%     147.154us        35.91%       1.861ms     620.213us       0.000us         0.00%       4.552ms       1.517ms             3  
+                                      flash_attn_3::fwd         1.02%      52.961us        33.07%       1.713ms     571.161us       3.412ms       100.00%       4.552ms       1.517ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.414ms       100.04%       3.414ms       3.414ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.412ms       100.00%       3.412ms       1.137ms             3  
+                                Activity Buffer Request        27.95%       1.448ms        27.95%       1.448ms       1.448ms       1.140ms        33.41%       1.140ms       1.140ms             1  
+                                            aten::empty         0.56%      29.272us         0.56%      29.272us       4.879us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       6.180us         0.12%       6.180us       2.060us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.41%     176.624us         3.41%     176.624us      58.875us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.17%       9.052us         0.44%      22.882us       3.814us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.27%      13.830us         0.27%      13.830us       2.305us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        57.78%       2.994ms        57.78%       2.994ms       2.994ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.092ms
-Self CUDA time total: 3.348ms
+Self CPU time total: 5.182ms
+Self CUDA time total: 3.412ms
 
 
 
@@ -4278,37 +4278,83 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                          xformers_meff         6.24%     320.533us        43.45%       2.233ms       2.233ms       0.000us         0.00%       4.496ms       4.496ms             1  
-                             xformers_flash3::flash_fwd         2.90%     149.124us        36.73%       1.887ms     629.094us       0.000us         0.00%       4.496ms       1.499ms             3  
-                                      flash_attn_3::fwd         1.48%      76.290us        33.83%       1.738ms     579.386us       3.368ms       100.00%       4.496ms       1.499ms             3  
-                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.369ms       100.05%       3.369ms       3.369ms             1  
-void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.368ms       100.00%       3.368ms       1.123ms             3  
-                                Activity Buffer Request        28.33%       1.456ms        28.33%       1.456ms       1.456ms       1.129ms        33.51%       1.129ms       1.129ms             1  
-                                            aten::empty         0.58%      29.962us         0.58%      29.962us       4.994us       0.000us         0.00%       0.000us       0.000us             6  
-                                   cudaFuncSetAttribute         0.12%       6.240us         0.12%       6.240us       2.080us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         3.31%     169.832us         3.31%     169.832us      56.611us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::reshape         0.21%      10.672us         0.48%      24.873us       4.146us       0.000us         0.00%       0.000us       0.000us             6  
-                                             aten::view         0.28%      14.201us         0.28%      14.201us       2.367us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        56.55%       2.906ms        56.55%       2.906ms       2.906ms       0.000us         0.00%       0.000us       0.000us             1  
+                                          xformers_meff         5.58%     285.697us        41.87%       2.143ms       2.143ms       0.000us         0.00%       4.544ms       4.544ms             1  
+                             xformers_flash3::flash_fwd         2.91%     148.714us        35.83%       1.834ms     611.255us       0.000us         0.00%       4.544ms       1.515ms             3  
+                                      flash_attn_3::fwd         1.04%      53.311us        32.92%       1.685ms     561.684us       3.402ms       100.00%       4.544ms       1.515ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.403ms       100.05%       3.403ms       3.403ms             1  
+void cutlass::device_kernel&lt;flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.402ms       100.00%       3.402ms       1.134ms             3  
+                                Activity Buffer Request        27.78%       1.422ms        27.78%       1.422ms       1.422ms       1.142ms        33.57%       1.142ms       1.142ms             1  
+                                            aten::empty         0.58%      29.640us         0.58%      29.640us       4.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.990us         0.12%       5.990us       1.997us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.40%     174.134us         3.40%     174.134us      58.045us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.17%       8.543us         0.45%      23.191us       3.865us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.29%      14.648us         0.29%      14.648us       2.441us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        58.13%       2.975ms        58.13%       2.975ms       2.975ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 5.138ms
-Self CUDA time total: 3.368ms
+Self CPU time total: 5.118ms
+Self CUDA time total: 3.402ms
 
 
 impl                     wl                  p50(ms)  ok
-xformers_meff            cuda_attn_L128_bfloat16     0.98  True
-xformers_meff            cuda_attn_L256_bfloat16     1.02  True
-xformers_meff            cuda_attn_L320_bfloat16     1.07  True
+xformers_meff            cuda_attn_L128_bfloat16     1.00  True
+xformers_meff            cuda_attn_L256_bfloat16     1.03  True
+xformers_meff            cuda_attn_L320_bfloat16     1.08  True
 xformers_meff            cuda_attn_L384_bfloat16     1.08  True
-xformers_meff            cuda_attn_L448_bfloat16     1.24  True
+xformers_meff            cuda_attn_L448_bfloat16     1.25  True
 xformers_meff            cuda_attn_L512_bfloat16     1.23  True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
+   Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading networkx (1.9MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading numpy (16.2MiB)
+Downloading torch (846.9MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading matplotlib (8.3MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading xformers (111.8MiB)
+Downloading pillow (6.7MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+      Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading sympy
+ Downloading numpy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
  Downloading xformers
-Installed 1 package in 13ms
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 38 packages in 236ms
 </div>
 </div>
 <div class="cell-artifacts">
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg
index 0f51d77bf35af08b6174bc4df17db6fe30a4e491..31d30c5dcfa68f4fc35593a1422ddd982b5374d8 100644
--- a/flash_attn/results/artifacts/combine/latency.svg
+++ b/flash_attn/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6390d15c17c1cced5612c62eb1fb07f7304765d3d9c2c842f634fd3107bbeaf
-size 24786
+oid sha256:520b28a43c879f6952cf0ddeade1438dbb5bd7caf01b6509254a4c68e9446ee6
+size 24783
diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html
index dbe50dede3b447c779732c2f39dd59bfd2928e4f..0682107b1540718d4e870417450dee78797760de 100644
--- a/flash_attn/results/combined_results.html
+++ b/flash_attn/results/combined_results.html
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:53.940454</dc:date>
+    <dc:date>2025-10-31T20:14:18.946177</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4217,96 +4217,96 @@ body[data-tool="eraser"] .main-content {
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.81 402.410473  L 835.361742 402.410473  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 406.365305  L 835.361742 406.365305  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="402.410473" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="406.365305" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="406.209692" transform="rotate(-0 40.81 406.209692)">1.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="410.164524" transform="rotate(-0 40.81 410.164524)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.81 343.789654  L 835.361742 343.789654  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 348.61376  L 835.361742 348.61376  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="343.789654" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="348.61376" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.588873" transform="rotate(-0 40.81 347.588873)">1.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="352.412978" transform="rotate(-0 40.81 352.412978)">1.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.81 285.168836  L 835.361742 285.168836  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 290.862214  L 835.361742 290.862214  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="285.168836" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="290.862214" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="288.968055" transform="rotate(-0 40.81 288.968055)">1.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="294.661433" transform="rotate(-0 40.81 294.661433)">1.4</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.81 226.548018  L 835.361742 226.548018  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 233.110668  L 835.361742 233.110668  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="226.548018" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="233.110668" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="230.347236" transform="rotate(-0 40.81 230.347236)">1.6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.909887" transform="rotate(-0 40.81 236.909887)">1.6</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.81 167.927199  L 835.361742 167.927199  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 175.359123  L 835.361742 175.359123  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="167.927199" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="175.359123" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="171.726418" transform="rotate(-0 40.81 171.726418)">1.8</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="179.158342" transform="rotate(-0 40.81 179.158342)">1.8</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 47.81 109.306381  L 835.361742 109.306381  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 117.607577  L 835.361742 117.607577  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="109.306381" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="117.607577" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="113.1056" transform="rotate(-0 40.81 113.1056)">2.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.406796" transform="rotate(-0 40.81 121.406796)">2.0</text>
      </g>
     </g>
     <g id="ytick_7">
      <g id="grid-y--8" class="grid grid-y">
-      <path d="M 47.81 50.685563  L 835.361742 50.685563  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 59.856031  L 835.361742 59.856031  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="50.685563" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="59.856031" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="54.484781" transform="rotate(-0 40.81 54.484781)">2.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="63.65525" transform="rotate(-0 40.81 63.65525)">2.2</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4314,73 +4314,73 @@ body[data-tool="eraser"] .main-content {
     </g>
    </g>
    <g id="series--torch-flash-ma" class="series">
-    <path d="M 83.607806 338.320039  L 226.799032 324.329888  L 369.990258 318.590616  L 513.181484 313.901244  L 656.37271 271.916135  L 799.563935 259.376848  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 344.244567  L 226.799032 326.470951  L 369.990258 319.632879  L 513.181484 311.200865  L 656.37271 263.410306  L 799.563935 258.605377  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#md7efaf3aec" x="83.607806" y="338.320039" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="226.799032" y="324.329888" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="369.990258" y="318.590616" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="513.181484" y="313.901244" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="656.37271" y="271.916135" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="799.563935" y="259.376848" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="83.607806" y="344.244567" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="226.799032" y="326.470951" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="369.990258" y="319.632879" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="513.181484" y="311.200865" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="656.37271" y="263.410306" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="799.563935" y="258.605377" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-mem-eff" class="series">
-    <path d="M 83.607806 163.963846  L 226.799032 145.342943  L 369.990258 117.045795  L 513.181484 117.544365  L 656.37271 83.816291  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 160.220133  L 226.799032 131.522812  L 369.990258 119.284971  L 513.181484 97.052936  L 656.37271 99.854174  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#m9b8c54d372" x="83.607806" y="163.963846" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="226.799032" y="145.342943" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="369.990258" y="117.045795" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="513.181484" y="117.544365" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="656.37271" y="83.816291" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.607806" y="160.220133" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="226.799032" y="131.522812" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="369.990258" y="119.284971" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="513.181484" y="97.052936" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="656.37271" y="99.854174" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="series--xformers-meff" class="series">
-    <path d="M 83.607806 407.071707  L 226.799032 396.194321  L 369.990258 382.362446  L 513.181484 378.056747  L 656.37271 332.261284  L 799.563935 334.228013  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 406.681206  L 226.799032 399.095541  L 369.990258 382.16221  L 513.181484 383.640938  L 656.37271 334.388976  L 799.563935 340.779474  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="mc655281e0b" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #2ca02c" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#mc655281e0b" x="83.607806" y="407.071707" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="226.799032" y="396.194321" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="369.990258" y="382.362446" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="513.181484" y="378.056747" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="656.37271" y="332.261284" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="799.563935" y="334.228013" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="83.607806" y="406.681206" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="226.799032" y="399.095541" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="369.990258" y="382.16221" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="513.181484" y="383.640938" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="656.37271" y="334.388976" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="799.563935" y="340.779474" style="fill: #2ca02c; stroke: #2ca02c" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn" class="series">
-    <path d="M 83.607806 418.848923  L 226.799032 406.104464  L 369.990258 393.547884  L 513.181484 387.046249  L 656.37271 340.26625  L 799.563935 333.615718  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 420.013439  L 226.799032 405.003813  L 369.990258 391.079337  L 513.181484 388.024281  L 656.37271 340.106668  L 799.563935 341.194996  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m61c8040d7e" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #d62728" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#m61c8040d7e" x="83.607806" y="418.848923" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="226.799032" y="406.104464" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="369.990258" y="393.547884" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="513.181484" y="387.046249" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="656.37271" y="340.26625" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="799.563935" y="333.615718" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="83.607806" y="420.013439" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="226.799032" y="405.003813" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="369.990258" y="391.079337" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="513.181484" y="388.024281" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="656.37271" y="340.106668" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="799.563935" y="341.194996" style="fill: #d62728; stroke: #d62728" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn3" class="series">
-    <path d="M 83.607806 428.387702  L 226.799032 417.179109  L 369.990258 396.852047  L 513.181484 396.728943  L 656.37271 348.383475  L 799.563935 348.523872  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 428.387702  L 226.799032 418.228917  L 369.990258 402.378716  L 513.181484 397.605262  L 656.37271 348.593258  L 799.563935 355.437105  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m7cd35be9cc" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #9467bd" />
     </defs>
     <g clip-path="url(#p09feef2583)">
      <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="226.799032" y="417.179109" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.852047" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="513.181484" y="396.728943" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.383475" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.523872" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.228917" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="369.990258" y="402.378716" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.605262" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.593258" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="799.563935" y="355.437105" style="fill: #9467bd; stroke: #9467bd" />
     </g>
    </g>
    <g id="patch_3">
@@ -4465,7 +4465,7 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 4.26s
+Cell: combine | 4.31s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4572,47 +4572,47 @@ Summary: 6 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.94  True
-hf_kernels_flash_attn    cuda_attn_L256_bfloat16     0.99  True
-hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.03  True
-hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.05  True
-hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.21  True
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.95  True
+hf_kernels_flash_attn    cuda_attn_L256_bfloat16     1.00  True
+hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.05  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.06  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.23  True
 hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.23  True
-hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.91  True
-hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.95  True
-hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.02  True
-hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.02  True
-hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.18  True
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.92  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.96  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.01  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.03  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.20  True
 hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.18  True
 sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
-  Error: module &#x27;sage_attention_cb34d81dafacbad9&#x27; has no attribute &#x27;fwd&#x27;
+  Error: module &#x27;sage_attention_ef0573391bb63704&#x27; has no attribute &#x27;fwd&#x27;
 torch_flash_ma           cuda_attn_L128_bfloat16     1.22  True
-torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
-torch_flash_ma           cuda_attn_L320_bfloat16     1.29  True
-torch_flash_ma           cuda_attn_L384_bfloat16     1.30  True
-torch_flash_ma           cuda_attn_L448_bfloat16     1.45  True
-torch_flash_ma           cuda_attn_L512_bfloat16     1.49  True
-torch_mem_eff            cuda_attn_L128_bfloat16     1.81  True
-torch_mem_eff            cuda_attn_L256_bfloat16     1.88  True
-torch_mem_eff            cuda_attn_L320_bfloat16     1.97  True
-torch_mem_eff            cuda_attn_L384_bfloat16     1.97  True
-torch_mem_eff            cuda_attn_L448_bfloat16     2.09  True
-torch_mem_eff            cuda_attn_L512_bfloat16     2.22  True
-xformers_meff            cuda_attn_L128_bfloat16     0.98  True
-xformers_meff            cuda_attn_L256_bfloat16     1.02  True
-xformers_meff            cuda_attn_L320_bfloat16     1.07  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.28  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.30  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.33  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.50  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.51  True
+torch_mem_eff            cuda_attn_L128_bfloat16     1.85  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.95  True
+torch_mem_eff            cuda_attn_L320_bfloat16     1.99  True
+torch_mem_eff            cuda_attn_L384_bfloat16     2.07  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.06  True
+torch_mem_eff            cuda_attn_L512_bfloat16     2.25  True
+xformers_meff            cuda_attn_L128_bfloat16     1.00  True
+xformers_meff            cuda_attn_L256_bfloat16     1.03  True
+xformers_meff            cuda_attn_L320_bfloat16     1.08  True
 xformers_meff            cuda_attn_L384_bfloat16     1.08  True
-xformers_meff            cuda_attn_L448_bfloat16     1.24  True
+xformers_meff            cuda_attn_L448_bfloat16     1.25  True
 xformers_meff            cuda_attn_L512_bfloat16     1.23  True
 
 GENERATING COMBINED VISUALIZATION
@@ -4637,7 +4637,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 190ms
+Installed 37 packages in 225ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4650,7 +4650,7 @@ Installed 37 packages in 190ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:53.940454</dc:date>
+    <dc:date>2025-10-31T20:14:18.946177</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4760,96 +4760,96 @@ Installed 37 packages in 190ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.81 402.410473  L 835.361742 402.410473  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 406.365305  L 835.361742 406.365305  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="402.410473" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="406.365305" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="406.209692" transform="rotate(-0 40.81 406.209692)">1.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="410.164524" transform="rotate(-0 40.81 410.164524)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.81 343.789654  L 835.361742 343.789654  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 348.61376  L 835.361742 348.61376  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="343.789654" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="348.61376" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.588873" transform="rotate(-0 40.81 347.588873)">1.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="352.412978" transform="rotate(-0 40.81 352.412978)">1.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.81 285.168836  L 835.361742 285.168836  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 290.862214  L 835.361742 290.862214  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="285.168836" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="290.862214" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="288.968055" transform="rotate(-0 40.81 288.968055)">1.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="294.661433" transform="rotate(-0 40.81 294.661433)">1.4</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.81 226.548018  L 835.361742 226.548018  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 233.110668  L 835.361742 233.110668  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_10">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="226.548018" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="233.110668" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_10">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="230.347236" transform="rotate(-0 40.81 230.347236)">1.6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="236.909887" transform="rotate(-0 40.81 236.909887)">1.6</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.81 167.927199  L 835.361742 167.927199  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 175.359123  L 835.361742 175.359123  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_11">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="167.927199" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="175.359123" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_11">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="171.726418" transform="rotate(-0 40.81 171.726418)">1.8</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="179.158342" transform="rotate(-0 40.81 179.158342)">1.8</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 47.81 109.306381  L 835.361742 109.306381  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 117.607577  L 835.361742 117.607577  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_12">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="109.306381" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="117.607577" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_12">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="113.1056" transform="rotate(-0 40.81 113.1056)">2.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.406796" transform="rotate(-0 40.81 121.406796)">2.0</text>
      </g>
     </g>
     <g id="ytick_7">
      <g id="grid-y--8" class="grid grid-y">
-      <path d="M 47.81 50.685563  L 835.361742 50.685563  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.81 59.856031  L 835.361742 59.856031  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_13">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.81" y="50.685563" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.81" y="59.856031" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_13">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="54.484781" transform="rotate(-0 40.81 54.484781)">2.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="63.65525" transform="rotate(-0 40.81 63.65525)">2.2</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4857,73 +4857,73 @@ Installed 37 packages in 190ms
     </g>
    </g>
    <g id="series--torch-flash-ma" class="series">
-    <path d="M 83.607806 338.320039  L 226.799032 324.329888  L 369.990258 318.590616  L 513.181484 313.901244  L 656.37271 271.916135  L 799.563935 259.376848  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 344.244567  L 226.799032 326.470951  L 369.990258 319.632879  L 513.181484 311.200865  L 656.37271 263.410306  L 799.563935 258.605377  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#md7efaf3aec" x="83.607806" y="338.320039" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="226.799032" y="324.329888" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="369.990258" y="318.590616" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="513.181484" y="313.901244" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="656.37271" y="271.916135" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="799.563935" y="259.376848" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="83.607806" y="344.244567" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="226.799032" y="326.470951" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="369.990258" y="319.632879" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="513.181484" y="311.200865" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="656.37271" y="263.410306" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="799.563935" y="258.605377" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-mem-eff" class="series">
-    <path d="M 83.607806 163.963846  L 226.799032 145.342943  L 369.990258 117.045795  L 513.181484 117.544365  L 656.37271 83.816291  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 160.220133  L 226.799032 131.522812  L 369.990258 119.284971  L 513.181484 97.052936  L 656.37271 99.854174  L 799.563935 45.999414  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#m9b8c54d372" x="83.607806" y="163.963846" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="226.799032" y="145.342943" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="369.990258" y="117.045795" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="513.181484" y="117.544365" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="656.37271" y="83.816291" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.607806" y="160.220133" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="226.799032" y="131.522812" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="369.990258" y="119.284971" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="513.181484" y="97.052936" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="656.37271" y="99.854174" style="fill: #ff7f0e; stroke: #ff7f0e" />
      <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="series--xformers-meff" class="series">
-    <path d="M 83.607806 407.071707  L 226.799032 396.194321  L 369.990258 382.362446  L 513.181484 378.056747  L 656.37271 332.261284  L 799.563935 334.228013  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 406.681206  L 226.799032 399.095541  L 369.990258 382.16221  L 513.181484 383.640938  L 656.37271 334.388976  L 799.563935 340.779474  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="mc655281e0b" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #2ca02c" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#mc655281e0b" x="83.607806" y="407.071707" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="226.799032" y="396.194321" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="369.990258" y="382.362446" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="513.181484" y="378.056747" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="656.37271" y="332.261284" style="fill: #2ca02c; stroke: #2ca02c" />
-     <use ns4:href="#mc655281e0b" x="799.563935" y="334.228013" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="83.607806" y="406.681206" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="226.799032" y="399.095541" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="369.990258" y="382.16221" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="513.181484" y="383.640938" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="656.37271" y="334.388976" style="fill: #2ca02c; stroke: #2ca02c" />
+     <use ns4:href="#mc655281e0b" x="799.563935" y="340.779474" style="fill: #2ca02c; stroke: #2ca02c" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn" class="series">
-    <path d="M 83.607806 418.848923  L 226.799032 406.104464  L 369.990258 393.547884  L 513.181484 387.046249  L 656.37271 340.26625  L 799.563935 333.615718  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 420.013439  L 226.799032 405.003813  L 369.990258 391.079337  L 513.181484 388.024281  L 656.37271 340.106668  L 799.563935 341.194996  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m61c8040d7e" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #d62728" />
     </defs>
     <g clip-path="url(#p09feef2583)">
-     <use ns4:href="#m61c8040d7e" x="83.607806" y="418.848923" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="226.799032" y="406.104464" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="369.990258" y="393.547884" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="513.181484" y="387.046249" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="656.37271" y="340.26625" style="fill: #d62728; stroke: #d62728" />
-     <use ns4:href="#m61c8040d7e" x="799.563935" y="333.615718" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="83.607806" y="420.013439" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="226.799032" y="405.003813" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="369.990258" y="391.079337" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="513.181484" y="388.024281" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="656.37271" y="340.106668" style="fill: #d62728; stroke: #d62728" />
+     <use ns4:href="#m61c8040d7e" x="799.563935" y="341.194996" style="fill: #d62728; stroke: #d62728" />
     </g>
    </g>
    <g id="series--hf-kernels-flash-attn3" class="series">
-    <path d="M 83.607806 428.387702  L 226.799032 417.179109  L 369.990258 396.852047  L 513.181484 396.728943  L 656.37271 348.383475  L 799.563935 348.523872  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.607806 428.387702  L 226.799032 418.228917  L 369.990258 402.378716  L 513.181484 397.605262  L 656.37271 348.593258  L 799.563935 355.437105  " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m7cd35be9cc" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #9467bd" />
     </defs>
     <g clip-path="url(#p09feef2583)">
      <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="226.799032" y="417.179109" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.852047" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="513.181484" y="396.728943" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.383475" style="fill: #9467bd; stroke: #9467bd" />
-     <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.523872" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="226.799032" y="418.228917" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="369.990258" y="402.378716" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.605262" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.593258" style="fill: #9467bd; stroke: #9467bd" />
+     <use ns4:href="#m7cd35be9cc" x="799.563935" y="355.437105" style="fill: #9467bd; stroke: #9467bd" />
     </g>
    </g>
    <g id="patch_3">
diff --git a/index.html b/index.html
index 6d43c0f1f0a8b42c583cb3e0d6a059916ac01ccc..11cdf1eef85f4dda68d9e978af612e8aae0078bb 100644
--- a/index.html
+++ b/index.html
@@ -4097,35 +4097,54 @@ body[data-tool="eraser"] .main-content {
     </div>
     
     <div class="main-content">
-        <h1>KERNELS COMMUNITY BENCHMARKS</h1>
+        <div class="linkbar">
+<a target="_blank" href="https://github.com/huggingface/kernels">Python Library</a> |
+<a target="_blank" href="https://github.com/huggingface/kernel-builder">Builder</a> |
+<a target="_blank" href="https://github.com/huggingface/kernels-community">Community</a> |
+<a target="_blank" href="https://huggingface.co/kernels-community">Community Hub</a> |
+<a target="_blank" href="https://github.com/huggingface/kernels-benchmarks">Benchmarks</a>
+</div>
+
+<p><br/></p>
+<h1>KERNELS COMMUNITY BENCHMARKS</h1>
 <p>This report aggregates latency and performance benchmarks across core model components.<br />
 Each section includes:<br />
 - A latency visualization<br />
 - Links to detailed implementation benchmarks  </p>
 <h2>TABLE OF CONTENTS</h2>
 <ul>
-<li><a href="#methodology">METHODOLOGY</a></li>
-<li><a href="#layer-normalization">LAYER NORMALIZATION</a></li>
-<li><a href="#rotary-position-embeddings">ROTARY POSITION EMBEDDINGS</a></li>
+<li><a href="#activation-functions">ACTIVATION FUNCTIONS</a></li>
 <li><a href="#flash-attention">FLASH ATTENTION</a></li>
+<li><a href="#deformable-detr">DEFORMABLE DETR</a></li>
+<li><a href="#openai-style-moe">OPENAI-STYLE MOE</a></li>
+<li><a href="#rotary-position-embeddings">ROTARY POSITION EMBEDDINGS</a></li>
 <li><a href="#causal-conv1d">CAUSAL CONV1D</a></li>
-<li><a href="#activation-functions">ACTIVATION FUNCTIONS</a></li>
-<li><a href="#notes">NOTES</a></li>
+<li><a href="#layer-normalization">LAYER NORMALIZATION</a></li>
 </ul>
+<h2>RUN YOURSELF</h2>
+<p>To run the benchmarks locally, clone the repository and use <code>uvx</code> to build and run the benchmarks:</p>
+<p>Note: the benchmarks are designed to run on a machine with a compatible NVIDIA GPU and CUDA installed; other hardware may not work as expected.</p>
+<div class="codehilite"><pre><span></span><code>git<span class="w"> </span>clone<span class="w"> </span>https://github.com/huggingface/kernels-benchmarks.git
+<span class="nb">cd</span><span class="w"> </span>kernels-benchmarks
+uvx<span class="w"> </span>https://github.com/drbh/uvnote.git<span class="w"> </span>build<span class="w"> </span>benches
+</code></pre></div>
+
 <h2>METHODOLOGY</h2>
-<p>Each benchmark is run with the <a href="https://github.com/huggingface/kernels-benchmarks">Kernels Benchmarking Framework</a> and follows these principles:<br />
+<p>Each benchmark is run with the
+<a target="_blank" href="https://github.com/huggingface/kernels-benchmarks">Kernels Benchmarking Framework</a> and follows these principles:<br />
 - a reference implementation (usually PyTorch native) is included for baseline comparison<br />
 - multiple input sizes and batch sizes are tested to reflect real-world usage<br />
 - runs are repeatable via python virtual environments and documented dependencies<br />
 - results are collected and visualized using standardized scripts  </p>
-<hr />
+<p><br/></p>
+<h2>BENCHMARKS</h2>
 <div class="alert">
   <strong>Note:</strong> Latency values are measured in milliseconds (ms). Lower values indicate better performance.
 </div>
 
-<h2>LAYER NORMALIZATION</h2>
+<h2>ACTIVATION FUNCTIONS</h2>
 <div class="artifact-preview">
-  <img src="layer_norm/results/artifacts/combine/latency.svg" alt="Layer Norm Latency" width="800">
+  <img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
 </div>
 
 <table>
@@ -4133,32 +4152,40 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td>HF Kernels Layer Norm</td>
-<td>HuggingFace kernels implementation</td>
+<td>HF Kernels SwiGLU</td>
+<td>HuggingFace kernels SwiGLU implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/activation">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/activation">HF</a></td>
+<td><a href="activation/impls/hf_kernels_swiglu.html">Bench</a></td>
 </tr>
 <tr>
-<td>PyTorch Layer Norm</td>
-<td>PyTorch native implementation</td>
+<td>PyTorch SwiGLU</td>
+<td>PyTorch native SwiGLU implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="activation/impls/torch_swiglu.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
 <p align="center">
-  <!-- <button onclick="window.location.href='layer_norm/'" style="margin-left: 20px; padding: 10px 20px; background-color: #007bff; color: white; border: none; border-radius: 5px; cursor: pointer;"> -->
-  <button 
-    onclick="window.location.href='layer_norm/'"
+  <button
+    onclick="window.location.href='activation/'"
     class="btn">
     Explore Full Bench
   </button>
 </p>
 
 <hr />
-<h2>ROTARY POSITION EMBEDDINGS</h2>
+<h2>FLASH ATTENTION</h2>
 <div class="artifact-preview">
-  <img src="rotary/results/artifacts/combine/latency.svg" alt="Rotary Position Embeddings Latency" width="800">
+  <img src="flash_attn/results/artifacts/combine/latency.svg" alt="Flash Attention Latency" width="800">
 </div>
 
 <table>
@@ -4166,31 +4193,68 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td>HF Kernels Rotary</td>
-<td>HuggingFace kernels implementation</td>
+<td>Flash Attention</td>
+<td>Torch SDPA Flash Attention implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="flash_attn/impls/flash_attention.html">Bench</a></td>
 </tr>
 <tr>
-<td>PyTorch Rotary</td>
-<td>PyTorch native implementation</td>
+<td>HF Kernels Flash Attention 2</td>
+<td>HuggingFace kernels Flash Attention</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/flash-attn2">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/flash-attn2">HF</a></td>
+<td><a href="flash_attn/impls/hf_kernels_flash_attn.html">Bench</a></td>
+</tr>
+<tr>
+<td>HF Kernels Flash Attention 3</td>
+<td>HuggingFace kernels Flash Attention 3</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/flash-attn3">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/flash-attn3">HF</a></td>
+<td><a href="flash_attn/impls/hf_kernels_flash_attn3.html">Bench</a></td>
+</tr>
+<tr>
+<td>Memory Efficient Attention</td>
+<td>Memory efficient attention implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="flash_attn/impls/mem_efficient_attention.html">Bench</a></td>
+</tr>
+<tr>
+<td>Sage Attention</td>
+<td>Sage attention implementation</td>
+<td>-</td>
+<td><a href="https://huggingface.co/kernels-community/sage_attention">HF</a></td>
+<td><a href="flash_attn/impls/sage_attention.html">Bench</a></td>
+</tr>
+<tr>
+<td>xFormers</td>
+<td>xFormers attention implementation</td>
+<td><a href="https://github.com/facebookresearch/xformers">GitHub</a></td>
+<td>-</td>
+<td><a href="flash_attn/impls/xformers.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
 <p align="center">
   <button
-    onclick="window.location.href='rotary/'"
+    onclick="window.location.href='flash_attn/'"
     class="btn">
     Explore Full Bench
   </button>
 </p>
 
 <hr />
-<h2>FLASH ATTENTION</h2>
+<h2>DEFORMABLE DETR</h2>
 <div class="artifact-preview">
-  <img src="flash_attn/results/artifacts/combine/latency.svg" alt="Flash Attention Latency" width="800">
+  <img src="deformable_detr/results/artifacts/combine/latency.svg" alt="Deformable DETR Latency" width="800">
 </div>
 
 <table>
@@ -4198,38 +4262,72 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td>Flash Attention</td>
-<td>Flash Attention implementation</td>
-</tr>
-<tr>
-<td>HF Kernels Flash Attention</td>
-<td>HuggingFace kernels Flash Attention</td>
+<td>HF Kernels Deformable DETR</td>
+<td>HuggingFace kernels Deformable DETR implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/deformable-detr">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/deformable-detr">HF</a></td>
+<td><a href="deformable_detr/impls/hf_kernels_deformable_detr.html">Bench</a></td>
 </tr>
 <tr>
-<td>HF Kernels Flash Attention 3</td>
-<td>HuggingFace kernels Flash Attention 3</td>
+<td>PyTorch Deformable DETR</td>
+<td>PyTorch native Deformable DETR implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="deformable_detr/impls/torch_deformable_detr.html">Bench</a></td>
 </tr>
+</tbody>
+</table>
+<p align="center">
+  <button
+    onclick="window.location.href='deformable_detr/'"
+    class="btn">
+    Explore Full Bench
+  </button>
+</p>
+
+<hr />
+<h2>OPENAI-STYLE MOE</h2>
+<div class="artifact-preview">
+  <img src="openai_moe/results/artifacts/combine/latency.svg" alt="OpenAI MoE Latency" width="800">
+</div>
+
+<table>
+<thead>
 <tr>
-<td>Memory Efficient Attention</td>
-<td>Memory efficient attention implementation</td>
+<th>Implementation</th>
+<th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
+</thead>
+<tbody>
 <tr>
-<td>Sage Attention</td>
-<td>Sage attention implementation</td>
+<td>GptOssExperts</td>
+<td>GPT OSS reference OpenAI-style MoE</td>
+<td>-</td>
+<td>-</td>
+<td><a href="openai_moe/impls/gpt_oss_moe.html">Bench</a></td>
 </tr>
 <tr>
-<td>xFormers</td>
-<td>xFormers attention implementation</td>
+<td>Binned PyTorch</td>
+<td>Binned PyTorch OpenAI-style MoE implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="openai_moe/impls/binned_torch.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
 <p align="center">
   <button
-    onclick="window.location.href='flash_attn/'"
+    onclick="window.location.href='openai_moe/'"
     class="btn">
     Explore Full Bench
   </button>
@@ -4246,16 +4344,25 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td>HF Kernels Causal Conv1D</td>
 <td>HuggingFace kernels implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/causal-conv1d">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/causal-conv1d">HF</a></td>
+<td><a href="causal_conv1d/impls/hf_kernels_causal_conv1d.html">Bench</a></td>
 </tr>
 <tr>
 <td>PyTorch Causal Conv1D</td>
 <td>PyTorch native implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="causal_conv1d/impls/torch_causal_conv1d.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
@@ -4268,9 +4375,9 @@ Each section includes:<br />
 </p>
 
 <hr />
-<h2>ACTIVATION FUNCTIONS</h2>
+<h2>ROTARY POSITION EMBEDDINGS</h2>
 <div class="artifact-preview">
-  <img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
+  <img src="rotary/results/artifacts/combine/latency.svg" alt="Rotary Position Embeddings Latency" width="800">
 </div>
 
 <table>
@@ -4278,28 +4385,77 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td>HF Kernels SwiGLU</td>
-<td>HuggingFace kernels SwiGLU implementation</td>
+<td>HF Kernels Rotary</td>
+<td>HuggingFace kernels implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/rotary">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/rotary">HF</a></td>
+<td><a href="rotary/impls/hf_kernels_rotary.html">Bench</a></td>
 </tr>
 <tr>
-<td>PyTorch SwiGLU</td>
-<td>PyTorch native SwiGLU implementation</td>
+<td>PyTorch Rotary</td>
+<td>PyTorch native implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="rotary/impls/torch_rotary.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
 <p align="center">
   <button
-    onclick="window.location.href='activation/'"
+    onclick="window.location.href='rotary/'"
     class="btn">
     Explore Full Bench
   </button>
 </p>
 
 <hr />
+<h2>LAYER NORMALIZATION</h2>
+<div class="artifact-preview">
+  <img src="layer_norm/results/artifacts/combine/latency.svg" alt="Layer Norm Latency" width="800">
+</div>
+
+<table>
+<thead>
+<tr>
+<th>Implementation</th>
+<th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>HF Kernels Layer Norm</td>
+<td>HuggingFace kernels implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/layer-norm">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/layer-norm">HF</a></td>
+<td><a href="layer_norm/impls/hf_kernels_layer_norm.html">Bench</a></td>
+</tr>
+<tr>
+<td>PyTorch Layer Norm</td>
+<td>PyTorch native implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="layer_norm/impls/torch_layer_norm.html">Bench</a></td>
+</tr>
+</tbody>
+</table>
+<p align="center">
+  <button 
+    onclick="window.location.href='layer_norm/'"
+    class="btn">
+    Explore Full Bench
+  </button>
+</p>
+
 <style>
     .controls {
         display: none !important;
@@ -4343,12 +4499,10 @@ Each section includes:<br />
     }
     :root {
         --bg-alert: #0069cbff;
-        --border-alert: #001628ff;
     }
     .alert {
-        padding: 5px;
+        padding: 5px 10px;
         background-color: var(--bg-alert);
-        border-left: 6px solid var(--border-alert);
         margin-bottom: 10px;
         border-radius: 6px;
     }
diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
index 2054adf984aee08467b8d3188d63a2b34ff7f13a..1c38cebcfad6cb20f64b80a0e6db2e230635d410 100644
--- a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
+++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
@@ -1,4 +1,4 @@
-{"ts": "2025-10-30T15:53:20Z", "run": "43d96fd4ebe14ca496dc9089d5327f41", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8346939999910319, "p50": 0.8380950000059784, "p90": 0.838174000023173, "mean": 0.8376522000048681, "iqr": 0.0016900000332498166, "raw_times": [0.8346939999910319, 0.8364839999899232, 0.8408140000142339, 0.838174000023173, 0.8380950000059784], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8434949999696073, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-30T15:53:20Z", "run": "43d96fd4ebe14ca496dc9089d5327f41", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6432289999670502, "p50": 1.649038999971708, "p90": 1.6514490000076876, "mean": 1.6484529999956976, "iqr": 0.006049999967672193, "raw_times": [1.649038999971708, 1.6531489999920268, 1.6453990000400154, 1.6432289999670502, 1.6514490000076876], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.660748999995576, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-30T15:53:20Z", "run": "43d96fd4ebe14ca496dc9089d5327f41", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.639337999961299, "p50": 1.64666899996746, "p90": 1.647079000008489, "mean": 1.645640799983994, "iqr": 0.0004199999921183917, "raw_times": [1.64666899996746, 1.6484589999663513, 1.639337999961299, 1.647079000008489, 1.6466590000163706], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6403390000050422, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-30T15:53:21Z", "run": "43d96fd4ebe14ca496dc9089d5327f41", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2456669999874066, "p50": 3.2605380000063633, "p90": 3.2625569999709114, "mean": 3.2589550000011513, "iqr": 0.014490999944882788, "raw_times": [3.2456669999874066, 3.2625569999709114, 3.2605380000063633, 3.277947000015047, 3.2480660000260286], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.260236999949484, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-31T20:00:11Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.807951000012963, "p50": 0.8174310000299556, "p90": 0.8198709999760467, "mean": 0.8162470000002031, "iqr": 0.0038399999766625115, "raw_times": [0.8160309999993842, 0.8198709999760467, 0.8174310000299556, 0.807951000012963, 0.819950999982666], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8318710000025931, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-31T20:00:11Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6729929999996784, "p50": 1.6790130000003956, "p90": 1.685203000022284, "mean": 1.6802827999867986, "iqr": 0.007120000077520672, "raw_times": [1.685203000022284, 1.6790130000003956, 1.6729929999996784, 1.686121999966872, 1.6780829999447633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6821429999822612, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-31T20:00:12Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6009309999844845, "p50": 1.6056009999942944, "p90": 1.611341000000266, "mean": 1.606853000009778, "iqr": 0.008409999963987502, "raw_times": [1.6009309999844845, 1.6056009999942944, 1.613461000033567, 1.6029310000362784, 1.611341000000266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6386120000220217, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-31T20:00:12Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.3123249999675863, "p50": 3.327974000001177, "p90": 3.3289149999973233, "mean": 3.3240905999946335, "iqr": 0.010180999993281148, "raw_times": [3.3325050000030387, 3.3289149999973233, 3.3123249999675863, 3.318734000004042, 3.327974000001177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.335275000040383, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
diff --git a/layer_norm/impls/cells/benchmark.py b/layer_norm/impls/cells/benchmark.py
index d871d1b25fedf8b294c567e9ac582decb62f3cde..6a00a9f99d8d044ab5f9dc0f5019344cef0612b9 100644
--- a/layer_norm/impls/cells/benchmark.py
+++ b/layer_norm/impls/cells/benchmark.py
@@ -3,7 +3,6 @@
 # dependencies = [
 #     "numpy",
 #     "torch==2.8.0",
-#     "kernels",
 #     "kernels-benchmark-tools",
 # ]
 #
@@ -13,37 +12,15 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
 
-# Load the layer norm kernel
-layer_norm_kernel = get_kernel("kernels-community/layer-norm")
 
-
-def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
-    B, S, D = x.shape
-    # The kernel expects [N, D] input; support beta (bias) if provided.
-    out = layer_norm_kernel.dropout_add_ln_fwd(
-        input=x.view(-1, D),
-        gamma=weight,
-        beta=bias,
-        rowscale=None,
-        colscale=None,
-        x0_subset=None,
-        z_subset=None,
-        dropout_p=0.0,
-        epsilon=eps,
-        rowscale_const=1.0,
-        z_numrows=S,
-        gen=None,
-        residual_in_fp32=False,
-        is_rms_norm=False,
-    )[0].view(B, S, D)
-    return out
+def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
+    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.LAYER_NORM,
-    impl_name="hf_kernels_layer_norm",
-    impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
-    impl_func=hf_kernels_layer_norm,
+    impl_name="torch_layer_norm",
+    impl_tags={"family": "torch", "op": "layer_norm"},
+    impl_func=torch_layer_norm,
 )
\ No newline at end of file
diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html
index c73029a3a400d5d4189d57862175a6044d2891f7..1f158344e6570f7297be0322ab6a513d52dde712 100644
--- a/layer_norm/impls/hf_kernels_layer_norm.html
+++ b/layer_norm/impls/hf_kernels_layer_norm.html
@@ -4107,11 +4107,12 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 6.10s
+Cell: benchmark | 10.09s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/hf_kernels_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/hf_kernels_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/layer-norm" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="49">
 <div class="code-wrap">
@@ -4178,19 +4179,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         4.90%     197.042us        46.64%       1.877ms       1.877ms       0.000us         0.00%       3.132ms       3.132ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         1.74%      69.952us        41.15%       1.656ms     551.934us       2.385ms       100.00%       3.132ms       1.044ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.386ms       100.06%       2.386ms       2.386ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       2.385ms       100.00%       2.385ms     794.945us             3  
-                                Activity Buffer Request        37.01%       1.489ms        37.01%       1.489ms       1.489ms     747.170us        31.33%     747.170us     747.170us             1  
-                                             aten::view         0.59%      23.780us         0.59%      23.780us       3.963us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         1.17%      47.212us         1.17%      47.212us       5.246us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.23%       9.090us         0.23%       9.090us       3.030us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         1.00%      40.411us         1.00%      40.411us      13.470us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        53.36%       2.147ms        53.36%       2.147ms       2.147ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         5.01%     203.177us        46.78%       1.895ms       1.895ms       0.000us         0.00%       3.141ms       3.141ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         1.71%      69.312us        41.16%       1.668ms     555.914us       2.399ms       100.00%       3.141ms       1.047ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.401ms       100.06%       2.401ms       2.401ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       2.399ms       100.00%       2.399ms     799.825us             3  
+                                Activity Buffer Request        36.95%       1.497ms        36.95%       1.497ms       1.497ms     742.012us        30.92%     742.012us     742.012us             1  
+                                             aten::view         0.61%      24.559us         0.61%      24.559us       4.093us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.20%      48.622us         1.20%      48.622us       5.402us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.23%       9.170us         0.23%       9.170us       3.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.07%      43.390us         1.07%      43.390us      14.463us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.22%       2.156ms        53.22%       2.156ms       2.156ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 4.023ms
-Self CUDA time total: 2.385ms
+Self CPU time total: 4.052ms
+Self CUDA time total: 2.399ms
 
 
 
@@ -4200,19 +4201,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         2.41%     154.482us        27.38%       1.753ms       1.753ms       0.000us         0.00%       6.413ms       6.413ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.72%      46.409us        24.77%       1.586ms     528.643us       4.824ms       100.00%       6.413ms       2.138ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.826ms       100.03%       4.826ms       4.826ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.824ms       100.00%       4.824ms       1.608ms             3  
-                                Activity Buffer Request        23.06%       1.476ms        23.06%       1.476ms       1.476ms       1.588ms        32.92%       1.588ms       1.588ms             1  
-                                             aten::view         0.20%      12.531us         0.20%      12.531us       2.089us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.47%      30.283us         0.47%      30.283us       3.365us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       5.150us         0.08%       5.150us       1.717us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.43%      27.650us         0.43%      27.650us       9.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        72.62%       4.650ms        72.62%       4.650ms       4.650ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         1.88%     119.443us        26.75%       1.701ms       1.701ms       0.000us         0.00%       6.407ms       6.407ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.71%      45.121us        24.67%       1.568ms     522.677us       4.827ms       100.00%       6.407ms       2.136ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.829ms       100.03%       4.829ms       4.829ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.827ms       100.00%       4.827ms       1.609ms             3  
+                                Activity Buffer Request        22.91%       1.456ms        22.91%       1.456ms       1.456ms       1.580ms        32.72%       1.580ms       1.580ms             1  
+                                             aten::view         0.21%      13.200us         0.21%      13.200us       2.200us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.51%      32.711us         0.51%      32.711us       3.635us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       5.289us         0.08%       5.289us       1.763us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.45%      28.522us         0.45%      28.522us       9.507us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        73.25%       4.656ms        73.25%       4.656ms       4.656ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.403ms
-Self CUDA time total: 4.824ms
+Self CPU time total: 6.357ms
+Self CUDA time total: 4.827ms
 
 
 
@@ -4222,19 +4223,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         2.19%     139.552us        27.69%       1.763ms       1.763ms       0.000us         0.00%       6.329ms       6.329ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.72%      45.651us        25.31%       1.612ms     537.326us       4.772ms       100.00%       6.329ms       2.110ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.774ms       100.03%       4.774ms       4.774ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.772ms       100.00%       4.772ms       1.591ms             3  
-                                Activity Buffer Request        23.61%       1.504ms        23.61%       1.504ms       1.504ms       1.557ms        32.63%       1.557ms       1.557ms             1  
-                                             aten::view         0.19%      11.951us         0.19%      11.951us       1.992us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.48%      30.520us         0.48%      30.520us       3.391us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       5.131us         0.08%       5.131us       1.710us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         0.42%      26.970us         0.42%      26.970us       8.990us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        72.31%       4.606ms        72.31%       4.606ms       4.606ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         1.89%     118.801us        26.85%       1.686ms       1.686ms       0.000us         0.00%       6.309ms       6.309ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.78%      49.183us        24.77%       1.555ms     518.493us       4.763ms       100.00%       6.309ms       2.103ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.765ms       100.03%       4.765ms       4.765ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.763ms       100.00%       4.763ms       1.588ms             3  
+                                Activity Buffer Request        22.96%       1.442ms        22.96%       1.442ms       1.442ms       1.546ms        32.46%       1.546ms       1.546ms             1  
+                                             aten::view         0.19%      11.741us         0.19%      11.741us       1.957us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.49%      30.460us         0.49%      30.460us       3.384us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       4.920us         0.08%       4.920us       1.640us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.46%      29.050us         0.46%      29.050us       9.683us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        73.15%       4.593ms        73.15%       4.593ms       4.593ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.370ms
-Self CUDA time total: 4.772ms
+Self CPU time total: 6.279ms
+Self CUDA time total: 4.763ms
 
 
 
@@ -4244,36 +4245,38 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                  hf_kernels_layer_norm         1.25%     143.461us        17.42%       1.995ms       1.995ms       0.000us         0.00%      12.814ms      12.814ms             1  
-                _layer_norm_f8ec252::dropout_add_ln_fwd         0.40%      45.652us        16.06%       1.839ms     613.131us       9.628ms       100.00%      12.814ms       4.271ms             3  
-                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.629ms       100.01%       9.629ms       9.629ms             1  
-void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       9.628ms       100.00%       9.628ms       3.209ms             3  
-                                Activity Buffer Request        12.97%       1.486ms        12.97%       1.486ms       1.486ms       3.186ms        33.09%       3.186ms       3.186ms             1  
-                                             aten::view         0.11%      12.411us         0.11%      12.411us       2.069us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::empty         0.27%      31.101us         0.27%      31.101us       3.456us       0.000us         0.00%       0.000us       0.000us             9  
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.04%       5.010us         0.04%       5.010us       1.670us       0.000us         0.00%       0.000us       0.000us             3  
-                                       cudaLaunchKernel         2.37%     271.915us         2.37%     271.915us      90.638us       0.000us         0.00%       0.000us       0.000us             3  
-                                  cudaDeviceSynchronize        82.58%       9.458ms        82.58%       9.458ms       9.458ms       0.000us         0.00%       0.000us       0.000us             1  
+                                  hf_kernels_layer_norm         1.11%     112.814us         7.31%     743.908us     743.908us       0.000us         0.00%      12.737ms      12.737ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.47%      47.722us         6.09%     619.105us     206.368us       9.594ms       100.00%      12.737ms       4.246ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.595ms       100.02%       9.595ms       9.595ms             1  
+void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       9.594ms       100.00%       9.594ms       3.198ms             3  
+                                Activity Buffer Request         2.50%     254.176us         2.50%     254.176us     254.176us       3.143ms        32.76%       3.143ms       3.143ms             1  
+                                             aten::view         0.12%      11.989us         0.12%      11.989us       1.998us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.30%      30.280us         0.30%      30.280us       3.364us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.05%       5.000us         0.05%       5.000us       1.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.77%     281.927us         2.77%     281.927us      93.976us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        92.69%       9.430ms        92.69%       9.430ms       9.430ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 11.453ms
-Self CUDA time total: 9.628ms
+Self CPU time total: 10.174ms
+Self CUDA time total: 9.594ms
 
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_layer_norm    LN_B16_S2048_D4096     0.84  True
-hf_kernels_layer_norm    LN_B16_S2048_D8192     1.65  True
-hf_kernels_layer_norm    LN_B16_S4096_D4096     1.65  True
-hf_kernels_layer_norm    LN_B16_S4096_D8192     3.26  True
+hf_kernels_layer_norm    LN_B16_S2048_D8192     1.66  True
+hf_kernels_layer_norm    LN_B16_S4096_D4096     1.66  True
+hf_kernels_layer_norm    LN_B16_S4096_D8192     3.27  True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 15 packages in 12ms
+Downloading hf-xet (3.2MiB)
+ Downloading hf-xet
+Installed 52 packages in 218ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
-Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.24it/s]
-Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.49it/s]</div>
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.22it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.45it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html
index 20d9e1c56bcbf9b885e76f1253898e95406c005a..1205e964abde5a00c2c82d107fdba63f6b4fce51 100644
--- a/layer_norm/impls/torch_layer_norm.html
+++ b/layer_norm/impls/torch_layer_norm.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.26s
+Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">
@@ -4122,7 +4122,7 @@ Cell: nv | 0.26s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:47 2025       
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:08 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.26s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   33C    P0            139W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   32C    P0             85W /  350W |       0MiB /  46068MiB |     22%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4153,13 +4153,13 @@ Cell: nv | 0.26s
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 7.42s
+Cell: benchmark | 3.89s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="26">
 <div class="code-wrap">
@@ -4203,19 +4203,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         3.91%     153.364us        46.27%       1.815ms       1.815ms       0.000us         0.00%       3.039ms       3.039ms             1  
-                                       aten::layer_norm         0.42%      16.299us        42.36%       1.661ms     553.716us       0.000us         0.00%       3.039ms       1.013ms             3  
-                                aten::native_layer_norm         2.01%      79.002us        41.94%       1.645ms     548.283us       2.327ms       100.00%       3.039ms       1.013ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.329ms       100.06%       2.329ms       2.329ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.327ms       100.00%       2.327ms     775.829us             3  
-                                Activity Buffer Request        37.33%       1.464ms        37.33%       1.464ms       1.464ms     711.872us        30.59%     711.872us     711.872us             1  
-                                            aten::empty         1.19%      46.781us         1.19%      46.781us       5.198us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.21%      47.400us         1.21%      47.400us      15.800us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.20%       7.811us         0.20%       7.811us       1.302us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        53.73%       2.107ms        53.73%       2.107ms       2.107ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         3.88%     150.743us        46.08%       1.790ms       1.790ms       0.000us         0.00%       3.031ms       3.031ms             1  
+                                       aten::layer_norm         0.46%      17.882us        42.20%       1.639ms     546.344us       0.000us         0.00%       3.031ms       1.010ms             3  
+                                aten::native_layer_norm         2.05%      79.451us        41.74%       1.621ms     540.384us       2.322ms       100.00%       3.031ms       1.010ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.323ms       100.06%       2.323ms       2.323ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.322ms       100.00%       2.322ms     773.873us             3  
+                                Activity Buffer Request        37.13%       1.442ms        37.13%       1.442ms       1.442ms     709.660us        30.57%     709.660us     709.660us             1  
+                                            aten::empty         1.23%      47.623us         1.23%      47.623us       5.291us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.17%      45.281us         1.17%      45.281us      15.094us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.17%       6.710us         0.17%       6.710us       1.118us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        53.92%       2.094ms        53.92%       2.094ms       2.094ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.922ms
-Self CUDA time total: 2.327ms
+Self CPU time total: 3.884ms
+Self CUDA time total: 2.322ms
 
 
 
@@ -4225,19 +4225,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.15%      73.661us        25.36%       1.626ms       1.626ms       0.000us         0.00%       6.533ms       6.533ms             1  
-                                       aten::layer_norm         0.14%       8.791us        24.21%       1.552ms     517.499us       0.000us         0.00%       6.533ms       2.178ms             3  
-                                aten::native_layer_norm         0.79%      50.951us        24.07%       1.544ms     514.569us       4.920ms       100.00%       6.533ms       2.178ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.922ms       100.03%       4.922ms       4.922ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.920ms       100.00%       4.920ms       1.640ms             3  
-                                Activity Buffer Request        22.34%       1.433ms        22.34%       1.433ms       1.433ms       1.613ms        32.78%       1.613ms       1.613ms             1  
-                                            aten::empty         0.45%      28.941us         0.45%      28.941us       3.216us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.43%      27.430us         0.43%      27.430us       9.143us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       3.590us         0.06%       3.590us       0.598us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        74.64%       4.787ms        74.64%       4.787ms       4.787ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.99%     129.362us        27.22%       1.769ms       1.769ms       0.000us         0.00%       6.490ms       6.490ms             1  
+                                       aten::layer_norm         0.17%      10.831us        25.23%       1.640ms     546.698us       0.000us         0.00%       6.490ms       2.163ms             3  
+                                aten::native_layer_norm         0.91%      59.414us        25.06%       1.629ms     543.087us       4.900ms       100.00%       6.490ms       2.163ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.901ms       100.03%       4.901ms       4.901ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.900ms       100.00%       4.900ms       1.633ms             3  
+                                Activity Buffer Request        23.14%       1.504ms        23.14%       1.504ms       1.504ms       1.590ms        32.46%       1.590ms       1.590ms             1  
+                                            aten::empty         0.46%      29.779us         0.46%      29.779us       3.309us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.49%      31.860us         0.49%      31.860us      10.620us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.06%       3.750us         0.06%       3.750us       0.625us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        72.78%       4.732ms        72.78%       4.732ms       4.732ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.413ms
-Self CUDA time total: 4.920ms
+Self CPU time total: 6.501ms
+Self CUDA time total: 4.900ms
 
 
 
@@ -4247,19 +4247,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         1.10%      68.311us        26.09%       1.619ms       1.619ms       0.000us         0.00%       6.232ms       6.232ms             1  
-                                       aten::layer_norm         0.13%       8.220us        24.99%       1.551ms     516.952us       0.000us         0.00%       6.232ms       2.077ms             3  
-                                aten::native_layer_norm         0.83%      51.401us        24.86%       1.543ms     514.212us       4.714ms       100.00%       6.232ms       2.077ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.716ms       100.03%       4.716ms       4.716ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.714ms       100.00%       4.714ms       1.571ms             3  
-                                Activity Buffer Request        23.07%       1.432ms        23.07%       1.432ms       1.432ms       1.518ms        32.20%       1.518ms       1.518ms             1  
-                                            aten::empty         0.45%      27.641us         0.45%      27.641us       3.071us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         0.45%      27.961us         0.45%      27.961us       9.320us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.06%       3.720us         0.06%       3.720us       0.620us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        73.91%       4.587ms        73.91%       4.587ms       4.587ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         1.73%     108.072us        26.73%       1.674ms       1.674ms       0.000us         0.00%       6.258ms       6.258ms             1  
+                                       aten::layer_norm         0.14%       8.910us        25.01%       1.566ms     522.010us       0.000us         0.00%       6.258ms       2.086ms             3  
+                                aten::native_layer_norm         0.87%      54.314us        24.86%       1.557ms     519.040us       4.736ms       100.00%       6.258ms       2.086ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.737ms       100.03%       4.737ms       4.737ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.736ms       100.00%       4.736ms       1.579ms             3  
+                                Activity Buffer Request        23.05%       1.444ms        23.05%       1.444ms       1.444ms       1.522ms        32.13%       1.522ms       1.522ms             1  
+                                            aten::empty         0.46%      28.531us         0.46%      28.531us       3.170us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.43%      26.620us         0.43%      26.620us       8.873us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.06%       4.039us         0.06%       4.039us       0.673us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        73.27%       4.589ms        73.27%       4.589ms       4.589ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 6.206ms
-Self CUDA time total: 4.714ms
+Self CPU time total: 6.263ms
+Self CUDA time total: 4.736ms
 
 
 
@@ -4269,33 +4269,27 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                       torch_layer_norm         0.61%      68.882us        14.40%       1.628ms       1.628ms       0.000us         0.00%      13.066ms      13.066ms             1  
-                                       aten::layer_norm         0.08%       8.939us        13.79%       1.559ms     519.662us       0.000us         0.00%      13.066ms       4.355ms             3  
-                                aten::native_layer_norm         0.44%      49.281us        13.71%       1.550ms     516.682us       9.830ms       100.00%      13.066ms       4.355ms             3  
-                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.831ms       100.01%       9.831ms       9.831ms             1  
-void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.830ms       100.00%       9.830ms       3.277ms             3  
-                                Activity Buffer Request        11.27%       1.275ms        11.27%       1.275ms       1.275ms       3.236ms        32.92%       3.236ms       3.236ms             1  
-                                            aten::empty         0.25%      28.400us         0.25%      28.400us       3.156us       0.000us         0.00%       0.000us       0.000us             9  
-                                       cudaLaunchKernel         1.71%     193.833us         1.71%     193.833us      64.611us       0.000us         0.00%       0.000us       0.000us             3  
-                                             aten::view         0.03%       3.811us         0.03%       3.811us       0.635us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        85.60%       9.678ms        85.60%       9.678ms       9.678ms       0.000us         0.00%       0.000us       0.000us             1  
+                                       torch_layer_norm         0.85%     101.562us        19.08%       2.285ms       2.285ms       0.000us         0.00%      13.093ms      13.093ms             1  
+                                       aten::layer_norm         0.08%       9.511us        18.23%       2.184ms     727.942us       0.000us         0.00%      13.093ms       4.364ms             3  
+                                aten::native_layer_norm         0.48%      57.051us        18.15%       2.174ms     724.772us       9.846ms       100.00%      13.093ms       4.364ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.848ms       100.01%       9.848ms       9.848ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.846ms       100.00%       9.846ms       3.282ms             3  
+                                Activity Buffer Request        11.95%       1.431ms        11.95%       1.431ms       1.431ms       3.247ms        32.97%       3.247ms       3.247ms             1  
+                                            aten::empty         0.24%      29.142us         0.24%      29.142us       3.238us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         5.45%     653.217us         5.45%     653.217us     217.739us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.03%       3.890us         0.03%       3.890us       0.648us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        80.92%       9.693ms        80.92%       9.693ms       9.693ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 11.306ms
-Self CUDA time total: 9.830ms
+Self CPU time total: 11.979ms
+Self CUDA time total: 9.846ms
 
 
 impl                     wl                  p50(ms)  ok
 torch_layer_norm         LN_B16_S2048_D4096     0.82  True
 torch_layer_norm         LN_B16_S2048_D8192     1.68  True
 torch_layer_norm         LN_B16_S4096_D4096     1.61  True
-torch_layer_norm         LN_B16_S4096_D8192     3.32  True
+torch_layer_norm         LN_B16_S4096_D8192     3.33  True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 236ms
-</div>
-</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg
index 34cd8f0c706e5fc5bb73d5ee3250c97c964ebb75..3ed8cd82e2a1a2caaeef1654e62fb6cc1922ef61 100644
--- a/layer_norm/results/artifacts/combine/latency.svg
+++ b/layer_norm/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fae823f30e52d7309b2e012b577544ab4911a33cc2d4ec0acdc57866ceb942fa
-size 14647
+oid sha256:be29ece5a8e85e2941ac21710ec16efd87996aaf0e9b42756a2189660af81a2c
+size 14642
diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html
index 2ea1962828136cb4445c6c556c5218357072c492..37977b7fe79b0f6b62ae7797328594688de81469 100644
--- a/layer_norm/results/combined_results.html
+++ b/layer_norm/results/combined_results.html
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:45.192018</dc:date>
+    <dc:date>2025-10-31T20:13:56.885734</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4191,70 +4191,70 @@ body[data-tool="eraser"] .main-content {
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 409.256777  L 840.20233 409.256777  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 409.237714  L 840.20233 409.237714  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_5">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="409.256777" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="409.237714" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_5">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.055996" transform="rotate(-0 40.72 413.055996)">1.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.036933" transform="rotate(-0 40.72 413.036933)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 331.172592  L 840.20233 331.172592  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 331.316879  L 840.20233 331.316879  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_6">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="331.172592" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="331.316879" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_6">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.971811" transform="rotate(-0 40.72 334.971811)">1.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.116098" transform="rotate(-0 40.72 335.116098)">1.5</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 253.088408  L 840.20233 253.088408  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 253.396045  L 840.20233 253.396045  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="253.088408" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="253.396045" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.887626" transform="rotate(-0 40.72 256.887626)">2.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.195264" transform="rotate(-0 40.72 257.195264)">2.0</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 175.004223  L 840.20233 175.004223  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 175.47521  L 840.20233 175.47521  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="175.004223" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="175.47521" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.803442" transform="rotate(-0 40.72 178.803442)">2.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.274429" transform="rotate(-0 40.72 179.274429)">2.5</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 96.920038  L 840.20233 96.920038  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 97.554376  L 840.20233 97.554376  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="96.920038" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="97.554376" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.719257" transform="rotate(-0 40.72 100.719257)">3.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.353595" transform="rotate(-0 40.72 101.353595)">3.0</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4262,27 +4262,27 @@ body[data-tool="eraser"] .main-content {
     </g>
    </g>
    <g id="series--torch-layer-norm" class="series">
-    <path d="M 83.741924 437.689571  L 323.888085 303.103046  L 564.034245 314.275643  L 804.180406 46.442361  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.741924 437.689571  L 323.888085 303.419195  L 564.034245 314.859843  L 804.180406 46.442361  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p2214f54723)">
      <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="323.888085" y="303.103046" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="564.034245" y="314.275643" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="323.888085" y="303.419195" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="564.034245" y="314.859843" style="fill: #1f77b4; stroke: #1f77b4" />
      <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--hf-kernels-layer-norm" class="series">
-    <path d="M 83.741924 434.541217  L 323.888085 307.897415  L 564.034245 308.267534  L 804.180406 56.232243  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.741924 434.525986  L 323.888085 307.036436  L 564.034245 306.425536  L 804.180406 56.12044  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p2214f54723)">
-     <use ns4:href="#m9b8c54d372" x="83.741924" y="434.541217" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="323.888085" y="307.897415" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="564.034245" y="308.267534" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="804.180406" y="56.232243" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.741924" y="434.525986" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="323.888085" y="307.036436" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="564.034245" y="306.425536" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="804.180406" y="56.12044" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
@@ -4428,13 +4428,13 @@ COMBINED BENCHMARK SUMMARY
 
 impl                     wl                  p50(ms)  ok
 hf_kernels_layer_norm    LN_B16_S2048_D4096     0.84  True
-hf_kernels_layer_norm    LN_B16_S2048_D8192     1.65  True
-hf_kernels_layer_norm    LN_B16_S4096_D4096     1.65  True
-hf_kernels_layer_norm    LN_B16_S4096_D8192     3.26  True
+hf_kernels_layer_norm    LN_B16_S2048_D8192     1.66  True
+hf_kernels_layer_norm    LN_B16_S4096_D4096     1.66  True
+hf_kernels_layer_norm    LN_B16_S4096_D8192     3.27  True
 torch_layer_norm         LN_B16_S2048_D4096     0.82  True
 torch_layer_norm         LN_B16_S2048_D8192     1.68  True
 torch_layer_norm         LN_B16_S4096_D4096     1.61  True
-torch_layer_norm         LN_B16_S4096_D8192     3.32  True
+torch_layer_norm         LN_B16_S4096_D8192     3.33  True
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4454,7 +4454,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 222ms
+Installed 37 packages in 216ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4467,7 +4467,7 @@ Installed 37 packages in 222ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:45.192018</dc:date>
+    <dc:date>2025-10-31T20:13:56.885734</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4551,70 +4551,70 @@ Installed 37 packages in 222ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 409.256777  L 840.20233 409.256777  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 409.237714  L 840.20233 409.237714  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_5">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="409.256777" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="409.237714" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_5">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.055996" transform="rotate(-0 40.72 413.055996)">1.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.036933" transform="rotate(-0 40.72 413.036933)">1.0</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 331.172592  L 840.20233 331.172592  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 331.316879  L 840.20233 331.316879  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_6">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="331.172592" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="331.316879" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_6">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.971811" transform="rotate(-0 40.72 334.971811)">1.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.116098" transform="rotate(-0 40.72 335.116098)">1.5</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 253.088408  L 840.20233 253.088408  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 253.396045  L 840.20233 253.396045  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_7">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="253.088408" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="253.396045" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_7">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.887626" transform="rotate(-0 40.72 256.887626)">2.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.195264" transform="rotate(-0 40.72 257.195264)">2.0</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 175.004223  L 840.20233 175.004223  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 175.47521  L 840.20233 175.47521  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_8">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="175.004223" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="175.47521" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_8">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.803442" transform="rotate(-0 40.72 178.803442)">2.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.274429" transform="rotate(-0 40.72 179.274429)">2.5</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 96.920038  L 840.20233 96.920038  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 97.554376  L 840.20233 97.554376  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_9">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="96.920038" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="97.554376" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_9">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.719257" transform="rotate(-0 40.72 100.719257)">3.0</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.353595" transform="rotate(-0 40.72 101.353595)">3.0</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4622,27 +4622,27 @@ Installed 37 packages in 222ms
     </g>
    </g>
    <g id="series--torch-layer-norm" class="series">
-    <path d="M 83.741924 437.689571  L 323.888085 303.103046  L 564.034245 314.275643  L 804.180406 46.442361  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.741924 437.689571  L 323.888085 303.419195  L 564.034245 314.859843  L 804.180406 46.442361  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p2214f54723)">
      <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="323.888085" y="303.103046" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="564.034245" y="314.275643" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="323.888085" y="303.419195" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="564.034245" y="314.859843" style="fill: #1f77b4; stroke: #1f77b4" />
      <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--hf-kernels-layer-norm" class="series">
-    <path d="M 83.741924 434.541217  L 323.888085 307.897415  L 564.034245 308.267534  L 804.180406 56.232243  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 83.741924 434.525986  L 323.888085 307.036436  L 564.034245 306.425536  L 804.180406 56.12044  " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p2214f54723)">
-     <use ns4:href="#m9b8c54d372" x="83.741924" y="434.541217" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="323.888085" y="307.897415" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="564.034245" y="308.267534" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="804.180406" y="56.232243" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="83.741924" y="434.525986" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="323.888085" y="307.036436" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="564.034245" y="306.425536" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="804.180406" y="56.12044" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
diff --git a/openai_moe/impls/artifacts/benchmark/openai_moe.jsonl b/openai_moe/impls/artifacts/benchmark/openai_moe.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..777cf4efb2a6108a3b57a081de8b6b9f1ee3abd3
--- /dev/null
+++ b/openai_moe/impls/artifacts/benchmark/openai_moe.jsonl
@@ -0,0 +1,8 @@
+{"ts": "2025-10-31T20:01:48Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 149.85902099999748, "p50": 150.05062800003088, "p90": 150.2997029999733, "mean": 150.08009959999526, "iqr": 0.4259410000031494, "raw_times": [149.85902099999748, 150.3173840000045, 150.2997029999733, 149.87376199997016, 150.05062800003088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 150.9511389999716, "peak_bytes": 416866816, "ok": true, "absmax": 2.765655517578125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.765655517578125e-05, "mae": 2.0696452338597737e-06, "mse": 7.332408985538663e-12, "ref": "naive_moe"}, "err": null}
+{"ts": "2025-10-31T20:02:12Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 199.76808500001653, "p50": 200.257487999977, "p90": 201.3672960000008, "mean": 200.6008808000047, "iqr": 1.3947150000035435, "raw_times": [200.257487999977, 201.63895400003184, 201.3672960000008, 199.97258099999726, 199.76808500001653], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 200.2076969999962, "peak_bytes": 632035840, "ok": true, "absmax": 1.621246337890625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.621246337890625e-05, "mae": 9.61917862696282e-07, "mse": 1.59423277530657e-12, "ref": "naive_moe"}, "err": null}
+{"ts": "2025-10-31T20:02:55Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 370.4508769999961, "p50": 372.7904090000038, "p90": 374.84007900002325, "mean": 372.8004498000132, "iqr": 3.7740770000027624, "raw_times": [374.84007900002325, 371.0660020000205, 370.4508769999961, 374.85488200002237, 372.7904090000038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 371.103493000021, "peak_bytes": 643844608, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.0501920516835526e-06, "mse": 7.1848811622476916e-12, "ref": "naive_moe"}, "err": null}
+{"ts": "2025-10-31T20:03:43Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 381.2919249999709, "p50": 382.6824700000202, "p90": 382.6975609999863, "mean": 382.48455139998896, "iqr": 0.3518089999943186, "raw_times": [382.345751999992, 381.2919249999709, 383.4050489999754, 382.6975609999863, 382.6824700000202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 384.12325699999883, "peak_bytes": 823386112, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 9.400179123986163e-07, "mse": 1.5130355735665235e-12, "ref": "naive_moe"}, "err": null}
+{"ts": "2025-10-31T20:05:12Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 735.1488859999336, "p50": 742.0204380000541, "p90": 746.9078719999516, "mean": 742.4016768000001, "iqr": 5.8942259998957525, "raw_times": [746.9175420000056, 746.9078719999516, 742.0204380000541, 735.1488859999336, 741.0136460000558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 715.4345070000545, "peak_bytes": 1036112384, "ok": true, "absmax": 3.2901763916015625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 3.2901763916015625e-05, "mae": 2.0572656467265915e-06, "mse": 7.247809123700488e-12, "ref": "naive_moe"}, "err": null}
+{"ts": "2025-10-31T20:06:54Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 799.7175619999553, "p50": 801.8970370000034, "p90": 803.0568570000014, "mean": 801.7179149999947, "iqr": 2.358569999955762, "raw_times": [799.7175619999553, 800.6982870000456, 803.2198319999679, 803.0568570000014, 801.8970370000034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 797.9236759999822, "peak_bytes": 1235263488, "ok": true, "absmax": 1.430511474609375e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.430511474609375e-05, "mae": 9.400343401466671e-07, "mse": 1.5107844445957919e-12, "ref": "naive_moe"}, "err": null}
+{"ts": "2025-10-31T20:09:51Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1484.2085319999114, "p50": 1486.4837999999736, "p90": 1487.529773999995, "mean": 1488.3352192000075, "iqr": 2.3281069999256943, "raw_times": [1498.252323000088, 1486.4837999999736, 1484.2085319999114, 1485.2016670000694, 1487.529773999995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1502.5766269999394, "peak_bytes": 1861947904, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.060702854578267e-06, "mse": 7.262949790198814e-12, "ref": "naive_moe"}, "err": null}
+{"ts": "2025-10-31T20:13:14Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1520.7084719999102, "p50": 1524.500331000013, "p90": 1525.4868470000247, "mean": 1524.7435091999705, "iqr": 1.6920530000561484, "raw_times": [1529.2271019999362, 1524.500331000013, 1523.7947939999685, 1525.4868470000247, 1520.7084719999102], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1532.9394789999924, "peak_bytes": 2062163968, "ok": true, "absmax": 1.5974044799804688e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5974044799804688e-05, "mae": 9.529014732834185e-07, "mse": 1.5621694476192216e-12, "ref": "naive_moe"}, "err": null}
diff --git a/openai_moe/impls/binned_torch.html b/openai_moe/impls/binned_torch.html
new file mode 100644
index 0000000000000000000000000000000000000000..a785c694cb0f0b00ce5a5c1d57f2f2717b2be01d
--- /dev/null
+++ b/openai_moe/impls/binned_torch.html
@@ -0,0 +1,4584 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>binned_torch</title>
+
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&display=swap" rel="stylesheet">
+
+    <script>
+// Iframe-friendly navigation router: does nothing unless the page is framed; keeps location.hash ('#/<path>') in step with in-frame page loads
+        (function() {
+            const isIframe = window.self !== window.top;
+            if (!isIframe) return; // Only activate in iframe context
+
+            // On load: if the hash has the form '#/<path>' and names a different page, navigate there
+            const hash = window.location.hash;
+            if (hash && hash.startsWith('#/')) {
+                const targetPath = hash.slice(2); // Remove '#/'
+                const currentPath = window.location.pathname.split('/').pop(); // last pathname segment only
+
+                // NOTE(review): a hash containing directories (e.g. '#/a/b.html') can never equal the last segment, so it always navigates - confirm intended
+                if (targetPath !== currentPath) {
+                    window.location.href = targetPath;
+                    return; // Stop execution, we're navigating away
+                }
+            }
+
+            // Intercept clicks on (or inside) <a> elements for hash-based navigation
+            document.addEventListener('click', function(e) {
+                const link = e.target.closest('a');
+                if (!link) return;
+
+                const href = link.getAttribute('href');
+
+                // Leave default behavior for missing hrefs, in-page anchors, hrefs starting with 'http', and javascript: links
+                if (!href || href.startsWith('#') || href.startsWith('http') || href.startsWith('javascript:')) {
+                    return;
+                }
+
+                e.preventDefault();
+
+                // Resolve the href against the current URL and keep only its pathname
+                const url = new URL(href, window.location.href);
+                let fullPath = url.pathname;
+
+                // Remove leading slash if present for cleaner paths
+                if (fullPath.startsWith('/')) {
+                    fullPath = fullPath.slice(1);
+                }
+
+                // Record the destination in this window's location.hash (NOTE(review): this is the frame's own URL, not the parent's)
+                window.location.hash = '#/' + fullPath;
+
+                // Paths ending in '.html' or '/' load in this frame by their last non-empty segment ('index.html' if there is none)
+                if (fullPath.endsWith('.html') || fullPath.endsWith('/')) {
+                    const pathParts = fullPath.split('/').filter(p => p);
+                    const targetFile = pathParts[pathParts.length - 1] || 'index.html';
+                    window.location.href = targetFile;
+                } else {
+                    // Anything else (raw .py, etc) is opened via window.open with the original href and target '_blank'
+                    window.open(href, '_blank');
+                }
+            });
+        })();
+
+        // Set data-theme, data-ui and data-widgets on <html> from this inline <head> script, immediately, to prevent flicker
+        (function() {
+            const configTheme = 'dark'; // presumably filled in by the page generator - verify against the template
+            const hasConfigUi = false;
+            const configUi = hasConfigUi ? 'None' : null;
+            const hasWidgetsConfig = false;
+            const widgetsOn = hasWidgetsConfig ? false : true;
+            let theme;
+            if (configTheme === 'auto') { // 'auto' follows the prefers-color-scheme media query
+                theme = window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
+            } else { // otherwise a stored 'uvnote-theme' value wins over the configured theme
+                theme = localStorage.getItem('uvnote-theme') || configTheme;
+            }
+            document.documentElement.setAttribute('data-theme', theme);
+
+            // UI style: the config value when present, else the stored 'uvnote-ui', else 'default'; unrecognized values become 'default'
+            let ui = hasConfigUi ? configUi : (localStorage.getItem('uvnote-ui') || 'default');
+            if (ui !== 'default' && ui !== 'none' && ui !== 'monocolor') { ui = 'default'; }
+            document.documentElement.setAttribute('data-ui', ui);
+
+            // Publish widget visibility as data-widgets="on"/"off"; the stylesheet hides widgets under [data-widgets="off"]
+            document.documentElement.setAttribute('data-widgets', widgetsOn ? 'on' : 'off');
+        })();
+    </script>
+    <style>
+:root[data-theme="light"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: #f6f8fa;
+    --bg-tertiary: #f8f9fa;
+    --bg-code: #f8f9fa;
+    --bg-error: #fdf2f2;
+    --bg-artifact: #e6f3ff;
+    --bg-artifact-hover: #d0e7ff;
+
+    --text-primary: #333;
+    --text-secondary: #656d76;
+    --text-error: #c53030;
+    --text-link: #0969da;
+
+    --border-primary: #e1e5e9;
+    --border-error: #e53e3e;
+    --border-cell-failed: #d73a49;
+
+    --shadow: rgba(0, 0, 0, 0.1);
+}
+
+:root[data-theme="dark"] {
+    --bg-primary: #0a0a0a;
+    --bg-secondary: #121212;
+    --bg-tertiary: #181818;
+    --bg-code: #0d0d0d;
+    --bg-error: #1a0f0f;
+    --bg-artifact: #151515;
+    --bg-artifact-hover: #1a1a1a;
+
+    --text-primary: #e0e0e0;
+    --text-secondary: #888888;
+    --text-error: #ff6b6b;
+    --text-link: #64b5f6;
+
+    --border-primary: #2a2a2a;
+    --border-error: #ff6b6b;
+    --border-cell-failed: #ff6b6b;
+
+    --shadow: rgba(255, 255, 255, 0.05);
+}
+
+/* Monocolor UI theme: black/white background, all text/borders single blue */
+:root[data-ui="monocolor"] {
+    --mono-color: #0a66ff;
+}
+
+:root[data-ui="monocolor"][data-theme="light"] {
+    --bg-primary: #ffffff;
+}
+
+:root[data-ui="monocolor"][data-theme="dark"] {
+    --bg-primary: #000000;
+}
+
+:root[data-ui="monocolor"] {
+    --bg-secondary: var(--bg-primary);
+    --bg-tertiary: var(--bg-primary);
+    --bg-code: var(--bg-primary);
+    --bg-error: var(--bg-primary);
+    --bg-artifact: var(--bg-primary);
+    --bg-artifact-hover: var(--bg-primary);
+
+    --text-primary: var(--mono-color);
+    --text-secondary: var(--mono-color);
+    --text-error: var(--mono-color);
+    --text-link: var(--mono-color);
+
+    --border-primary: var(--mono-color);
+    --border-error: var(--mono-color);
+    --border-cell-failed: var(--mono-color);
+
+    --shadow: none;
+}
+
+:root[data-ui="monocolor"] a {
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button,
+:root[data-ui="monocolor"] .theme-toggle,
+:root[data-ui="monocolor"] .reset-toggle,
+:root[data-ui="monocolor"] .back-button {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button:hover,
+:root[data-ui="monocolor"] .theme-toggle:hover,
+:root[data-ui="monocolor"] .reset-toggle:hover,
+:root[data-ui="monocolor"] .back-button:hover {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-dropdown {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    box-shadow: none;
+}
+
+:root[data-ui="monocolor"] .menu-item {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .system-info {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell {
+    border-color: var(--mono-color);
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .cell-header {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact:hover {
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .artifact-preview img,
+:root[data-ui="monocolor"] .artifact-preview svg {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .status-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .minimap,
+:root[data-ui="monocolor"] .file-explorer,
+:root[data-ui="monocolor"] .tools-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell-code {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tools-title,
+:root[data-ui="monocolor"] .file-explorer-section-title,
+:root[data-ui="monocolor"] .minimap-title {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button.active {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .file-explorer-item,
+:root[data-ui="monocolor"] .minimap-item {
+    color: var(--mono-color);
+}
+
+/* Force Pygments code to mono blue on mono bg */
+:root[data-ui="monocolor"] .highlight {
+    background: var(--bg-primary) !important;
+    color: var(--mono-color) !important;
+}
+
+:root[data-ui="monocolor"] .highlight *,
+:root[data-ui="monocolor"] .highlight .hll {
+    color: var(--mono-color) !important;
+    background: transparent !important;
+    border-color: var(--mono-color) !important;
+}
+
+/* Default code font + metrics (overridable via frontmatter) */
+:root {
+    --code-font-size: 0.95rem;
+    --code-line-height: 1.5;
+    --code-pad-y: 0.75rem;
+}
+
+/* Minimal UI theme overrides base variables for a flatter, 90s look */
+:root[data-ui="none"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: transparent;
+    --bg-tertiary: transparent;
+    --bg-code: #f9f9f9;
+    --bg-error: #fff0f0;
+    --bg-artifact: #f0f7ff;
+    --bg-artifact-hover: #e5f1ff;
+
+    --text-primary: #000000;
+    --text-secondary: #222222;
+    --text-error: #a00000;
+    --text-link: #0000ee;
+
+    --border-primary: #cccccc;
+    --border-error: #cc0000;
+    --border-cell-failed: #cc0000;
+
+    --shadow: none;
+}
+
+html {
+    overscroll-behavior: none;
+}
+
+body {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    line-height: 1.4;
+    max-width: 1000px;
+    margin: 0 auto;
+    padding: 15px;
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    transition: background-color 0.2s ease, color 0.2s ease;
+    overscroll-behavior: none;
+}
+
+/* Minimal "none" UI theme overrides */
+:root[data-ui="none"] body {
+    font-family: 'Times New Roman', Times, serif;
+    line-height: 1.5;
+    max-width: 860px;
+    padding: 12px;
+    background: #ffffff;
+    color: #000000;
+    transition: none;
+}
+
+/* Two panel layout removed */
+
+.controls {
+    position: fixed;
+    top: 20px;
+    right: 20px;
+    display: flex;
+    flex-direction: column;
+    align-items: flex-end;
+    gap: 0.25rem;
+    z-index: 1000;
+}
+
+.controls-buttons {
+    display: flex;
+    gap: 0.5rem;
+}
+
+.menu-button {
+    position: relative;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+/* Keep default control styling when widgets are enabled, even in minimal UI */
+:root[data-ui="none"][data-widgets="on"] .menu-button,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle,
+:root[data-ui="none"][data-widgets="on"] .back-button {
+    background: #f6f6f6;
+    border: 1px solid #cccccc;
+    color: #222222;
+}
+
+.menu-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+/* Controls state indicator (top-right) */
+/* Status widget (bottom-right) */
+.status-widget {
+    position: fixed;
+    right: 20px;
+    bottom: 20px;
+    width: auto;
+    max-width: 260px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 6px 8px;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    z-index: 100;
+}
+
+.status-widget strong {
+    color: var(--text-primary);
+}
+
+:root[data-ui="none"][data-widgets="on"] .status-widget {
+    background: #f6f6f6;
+    border-color: #ccc;
+    color: #222;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .back-button:hover {
+    background: #ededed;
+    border-color: #bbbbbb;
+    color: #000000;
+}
+
+.menu-dropdown {
+    position: absolute;
+    top: 100%;
+    right: 0;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    box-shadow: 0 4px 12px var(--shadow);
+    min-width: 160px;
+    opacity: 0;
+    visibility: hidden;
+    transform: translateY(-8px);
+    transition: all 0.2s ease;
+    z-index: 1001;
+    margin-top: 4px;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-dropdown {
+    background: #ffffff;
+    border: 1px solid #cccccc;
+    box-shadow: none;
+}
+
+.menu-button.active .menu-dropdown {
+    opacity: 1;
+    visibility: visible;
+    transform: translateY(0);
+}
+
+.menu-item {
+    display: block;
+    padding: 8px 12px;
+    color: var(--text-secondary);
+    text-decoration: none;
+    font-size: 0.85rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: pointer;
+}
+
+:root[data-ui="none"] .menu-item {
+    color: #000;
+    border-bottom: 1px solid #eee;
+}
+
+.menu-item:last-child {
+    border-bottom: none;
+}
+
+.menu-item:hover {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+}
+
+.menu-checkbox {
+    display: inline-block;
+    width: 16px;
+    font-family: monospace;
+    color: var(--text-link);
+}
+
+.theme-toggle,
+.reset-toggle,
+.back-button {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 4px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+.back-button {
+    text-decoration: none;
+    display: inline-block;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover,
+.back-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+.system-info {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    padding: 8px 12px;
+    margin-bottom: 16px;
+    font-size: 0.85em;
+    color: var(--text-secondary);
+}
+
+.system-info-header {
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 2px;
+}
+
+.system-info-content {
+    font-family: monospace;
+}
+
+.theme-toggle,
+.reset-toggle {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    /* padding: 0.4rem 0.6rem; */
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    user-select: none;
+    transition: all 0.2s ease;
+    text-transform: lowercase;
+    letter-spacing: 0;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover {
+    background: var(--bg-tertiary);
+    border-color: var(--text-secondary);
+    color: var(--text-primary);
+}
+
+.minimap {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Hide widgets and controls when disabled via frontmatter */
+:root[data-widgets="off"] .controls,
+:root[data-widgets="off"] .minimap,
+:root[data-widgets="off"] .file-explorer,
+:root[data-widgets="off"] .tools-widget,
+:root[data-widgets="off"] .status-widget {
+    display: none !important;
+}
+
+.file-explorer {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Drawing overlay */
+.draw-overlay {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100vw;
+    height: 100vh;
+    z-index: 80;
+    /* under widgets (100) and controls (1000) */
+    display: block;
+    pointer-events: none;
+    /* enabled only when a tool is active */
+}
+
+/* Tools widget */
+.tools-widget {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    z-index: 100;
+    opacity: 0.95;
+}
+
+.tools-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    user-select: none;
+}
+
+.tools-row {
+    display: flex;
+    gap: 0.4rem;
+    flex-wrap: wrap;
+}
+
+.tool-button {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.25rem 0.4rem;
+    cursor: pointer;
+    color: var(--text-secondary);
+    font-family: inherit;
+    font-size: 0.75rem;
+    user-select: none;
+}
+
+.tool-button:hover {
+    color: var(--text-primary);
+}
+
+.tool-button.active {
+    color: var(--text-primary);
+    border-color: var(--text-secondary);
+    background: var(--bg-secondary);
+}
+
+.minimap:hover,
+.file-explorer:hover {
+    opacity: 1;
+}
+
+.minimap-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.minimap-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.15rem 0;
+    border-left: 2px solid transparent;
+    padding-left: 0.5rem;
+    transition: all 0.2s ease;
+    cursor: pointer;
+}
+
+.minimap-item:hover {
+    color: var(--text-primary);
+    border-left-color: var(--text-secondary);
+}
+
+.minimap-item.active {
+    color: var(--text-primary);
+    border-left-color: var(--text-link);
+}
+
+.minimap-heading {
+    font-weight: normal;
+}
+
+.minimap-heading.h1 {
+    padding-left: 0.5rem;
+}
+
+.minimap-heading.h2 {
+    padding-left: 1rem;
+}
+
+.minimap-heading.h3 {
+    padding-left: 1.5rem;
+}
+
+.minimap-heading.h4 {
+    padding-left: 2rem;
+}
+
+.minimap-heading.h5 {
+    padding-left: 2.5rem;
+}
+
+.minimap-heading.h6 {
+    padding-left: 3rem;
+}
+
+.minimap-cell {
+    color: var(--text-link);
+    opacity: 0.8;
+    font-style: italic;
+}
+
+.minimap-cell:hover {
+    opacity: 1;
+}
+
+.file-explorer-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.file-explorer-section {
+    margin-bottom: 0.75rem;
+}
+
+.file-explorer-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin-bottom: 0.25rem;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.file-explorer-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.1rem 0;
+    margin-left: 0.5rem;
+    transition: color 0.2s ease;
+    cursor: pointer;
+    font-family: monospace;
+}
+
+.file-explorer-item:hover {
+    color: var(--text-primary);
+}
+
+.file-explorer-item.script {
+    color: var(--text-link);
+}
+
+.file-explorer-item.artifact {
+    color: var(--text-secondary);
+    opacity: 0.8;
+}
+
+
+/* Hide widgets on smaller screens */
+@media (max-width: 768px) {
+
+    .minimap,
+    .file-explorer,
+    .tools-widget {
+        display: none;
+    }
+}
+
+.cell {
+    margin: 1rem 0;
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    overflow: hidden;
+    background: var(--bg-secondary);
+}
+
+:root[data-ui="none"] .cell {
+    margin: 1em 0;
+    border: none;
+    background: transparent;
+}
+
+.cell-header {
+    background: var(--bg-secondary);
+    padding: 0.5rem 1rem;
+    border-bottom: 1px solid var(--border-primary);
+    font-family: inherit;
+    font-size: 0.85rem;
+}
+
+:root[data-ui="none"] .cell-header {
+    background: transparent;
+    border: none;
+    padding: 0;
+    font-weight: bold;
+}
+
+:root[data-ui="none"] .cell-content {
+    padding: 0;
+}
+
+:root[data-ui="none"] .copy-button,
+:root[data-ui="none"] .collapse-indicators,
+:root[data-ui="none"] .cell-meta,
+:root[data-ui="none"] .cell-outputs-header {
+    display: none !important;
+}
+
+:root[data-ui="none"] pre,
+:root[data-ui="none"] code {
+    font-family: Menlo, Monaco, 'Courier New', monospace;
+}
+
+:root[data-ui="none"] .code-content pre {
+    background: #f9f9f9;
+    border: 1px solid #ddd;
+    padding: 8px;
+}
+
+:root[data-ui="none"] .output {
+    background: transparent;
+    border: none;
+    padding: 0.25em 0;
+}
+.cell-header { /* selector restored: these declarations had no rule and also invalidated the following :hover rule; .cell-header inferred from context - confirm against the template */
+    color: var(--text-secondary);
+    cursor: pointer;
+    user-select: none;
+    transition: background-color 0.2s ease;
+}
+
+.cell-header:hover {
+    background: var(--bg-tertiary);
+}
+
+.collapse-indicators {
+    color: var(--text-secondary);
+    font-size: 0.8rem;
+    opacity: 0.7;
+}
+
+.collapse-indicators span:hover {
+    color: var(--text-primary);
+    opacity: 1;
+}
+
+.cell-code {
+    display: block;
+    background: var(--bg-code);
+}
+
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code pre {
+    margin: 0;
+    padding: 0.75rem;
+    background: var(--bg-code);
+    overflow-x: auto;
+    color: var(--text-primary);
+}
+
+.cell-output {
+    padding: 0.75rem;
+    /* background: var(--bg-primary); */
+    background: var(--bg-secondary);
+}
+
+.cell-output.collapsed {
+    display: none;
+}
+
+.cell-stdout {
+    background: var(--bg-tertiary);
+    padding: 0.75rem;
+    border-radius: 1px;
+    /* margin: 0.25rem 0; */
+    font-family: inherit;
+    font-size: 0.9rem;
+    white-space: pre-wrap;
+    color: var(--text-primary);
+}
+
+.cell-stdout {
+    background: var(--bg-tertiary);
+    padding: 0.75rem;
+    border-radius: 1px;
+    font-family: inherit;
+    font-size: 0.9rem;
+    color: var(--text-primary);
+
+    /* key bits */
+    overflow: auto;
+    /* show scrollbars when needed */
+    max-width: 100%;
+    /* respects whatever layout width you give it */
+}
+
+.cell-stdout .stdout-text {
+    margin: 0;
+    /* reset pre default margin */
+    white-space: pre;
+    /* keep line breaks, NO wrapping */
+    display: inline-block;
+    /* shrink-to-content */
+    min-width: max-content;
+    /* allow very long lines to define intrinsic width */
+    font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+    tab-size: 2;
+}
+
+.cell-stderr {
+    background: var(--bg-error);
+    border-left: 2px solid var(--border-error);
+    padding: 1rem;
+    margin: 0.5rem 0;
+    font-family: inherit;
+    font-size: 0.9rem;
+    color: var(--text-error);
+    white-space: pre-wrap;
+}
+
+.uv-install-logs {
+    margin: 0.5rem 0;
+}
+
+.uv-logs-header {
+    cursor: pointer;
+    padding: 0.75rem;
+    border-left: 3px solid var(--border-color);
+    font-family: inherit;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    user-select: none;
+}
+
+.uv-logs-content {
+    background: var(--bg-secondary);
+    padding: 1rem;
+    border-left: 3px solid var(--border-color);
+    white-space: pre-wrap;
+    font-family: monospace;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    overflow-x: auto;
+}
+
+.cell-artifacts {
+    margin: 1rem 0;
+}
+
+.cell-artifacts h4 {
+    margin: 0 0 0.5rem 0;
+    color: var(--text-secondary);
+    font-size: 0.9rem;
+}
+
+.artifact {
+    display: inline-block;
+    background: var(--bg-artifact);
+    padding: 0.25rem 0.5rem;
+    border-radius: 1px;
+    margin: 0.25rem 0.5rem 0.25rem 0;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-link);
+    text-decoration: none;
+    transition: background-color 0.2s ease;
+    border: 1px solid var(--border-primary);
+}
+
+.artifact:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-preview {
+    margin-top: 1rem;
+}
+
+.artifact-preview img {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.artifact-preview svg {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+    display: block;
+}
+
+/* Force the theme's primary text colour as the fill of every <g> group inside
+   an inline SVG preview. The selector matches all groups, not only text, and
+   !important makes it win over non-important fills. */
+.artifact-preview svg g {
+    fill: var(--text-primary) !important;
+}
+
+/* Give inline SVG previews a transparent background */
+.artifact-preview svg {
+    background: transparent;
+}
+
+/* Invert SVG images in dark mode */
+:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
+    filter: invert(0.9) hue-rotate(180deg);
+}
+
+/* Keep SVG images readable in monocolor mode */
+:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
+    filter: none;
+}
+
+/* CSV table styling */
+.artifact-csv {
+    margin-top: 1rem;
+    overflow-x: auto;
+}
+
+.csv-table {
+    width: 100%;
+    border-collapse: collapse;
+    font-size: 0.9rem;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.csv-table th,
+.csv-table td {
+    padding: 0.5rem 0.75rem;
+    text-align: left;
+    border: 1px solid var(--border-primary);
+}
+
+.csv-table th {
+    background: var(--bg-tertiary);
+    font-weight: 600;
+    color: var(--text-primary);
+}
+
+.csv-table tbody tr:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-csv-error {
+    margin-top: 1rem;
+    padding: 1rem;
+    background: var(--bg-error);
+    color: var(--text-error);
+    border: 1px solid var(--border-error);
+    border-radius: 1px;
+}
+
+.cell-failed {
+    border-color: var(--border-cell-failed);
+}
+
+.cell-failed .cell-header {
+    background: var(--bg-error);
+    color: var(--text-error);
+}
+
+.cell-commented {
+    opacity: 0.6;
+    border-style: dashed;
+}
+
+.cell-commented .cell-header {
+    background: var(--bg-secondary);
+    color: var(--text-secondary);
+    font-style: italic;
+}
+
+.run-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+}
+
+.run-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.run-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+}
+
+.copy-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+}
+
+.copy-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.copy-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+}
+
+.copy-btn.copied {
+    color: #4caf50;
+    background: var(--bg-primary);
+    border-color: #4caf50;
+    transition: all 0.2s ease;
+}
+
+.raw-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+    text-decoration: none;
+    display: inline-block;
+}
+
+.raw-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    text-decoration: none;
+}
+
+.github-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+    text-decoration: none;
+    display: inline-block;
+}
+
+.github-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    text-decoration: none;
+}
+
+.hf-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+    text-decoration: none;
+    display: inline-block;
+}
+
+.hf-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    text-decoration: none;
+}
+
+.output-stale {
+    opacity: 0.5;
+    position: relative;
+}
+
+.output-stale::after {
+    content: '⏳ updating...';
+    position: absolute;
+    top: 8px;
+    right: 8px;
+    background: var(--bg-secondary);
+    padding: 4px 8px;
+    border-radius: 2px;
+    font-size: 0.75em;
+    color: var(--text-secondary);
+    border: 1px solid var(--border-primary);
+}
+
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+    margin-top: 1.5rem;
+    margin-bottom: 0.75rem;
+    color: var(--text-primary);
+}
+
+h1 {
+    margin-top: 0;
+    margin-bottom: 1rem;
+}
+
+p {
+    margin: 0.75rem 0;
+    color: var(--text-primary);
+}
+
+a {
+    color: var(--text-link);
+}
+
+img {
+    max-width: 100%;
+    height: auto;
+    border-radius: 1px;
+    box-shadow: none;
+}
+
+pre,
+code {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+}
+
+.code-wrap {
+    position: relative;
+}
+
+.code-line-highlight {
+    display: none;
+    position: absolute;
+    left: 0;
+    right: 0;
+    height: 1.5em;
+    background: rgba(255, 235, 170, 0.35);
+    pointer-events: none;
+    border-left: 3px solid #f4c542;
+}
+
+.line-number {
+    cursor: pointer;
+    text-decoration: none;
+    color: var(--text-secondary);
+    padding: 0 0.25rem;
+}
+
+.line-number.selected {
+    background: rgba(255, 235, 170, 0.4);
+    color: var(--text-primary);
+}
+
+/* Line numbers */
+.highlight-with-lines {
+    display: flex;
+}
+
+.line-numbers {
+    background: var(--bg-tertiary);
+    padding: var(--code-pad-y) 0.5rem;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+    line-height: var(--code-line-height);
+    color: var(--text-secondary);
+    user-select: none;
+    text-align: right;
+    border-right: 1px solid var(--border-primary);
+}
+
+.line-numbers .line-number {
+    display: block;
+    line-height: var(--code-line-height);
+}
+
+.highlight-with-lines .highlight {
+    flex: 1;
+}
+
+.highlight .hll {
+    background-color: transparent;
+}
+
+/* don't conflict with our highlight */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem;
+    line-height: var(--code-line-height);
+}
+
+/* Collapsed code styling */
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code.expanded {
+    display: block;
+}
+
+/* NOTE(review): the template tags that belonged here (`if config.collapse_code`
+   / `else` / `endif`) reached the output unrendered as invalid brace blocks.
+   Both branches were therefore emitted and the later `display: block` rule
+   always won. Only that effective rule is kept; regenerate the page from a
+   fixed template if code should be collapsed by default. */
+.cell-code {
+    display: block;
+    border-bottom: 1px solid var(--border-primary);
+}
+
+/* NOTE(review): a `pygments_css` placeholder was also emitted unrendered at
+   this point (an invalid block that browsers ignore), so no Pygments token
+   styles are defined here - TODO restore them when the page is regenerated. */
+
+/* Ensure our code metrics override Pygments defaults */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem !important;
+    line-height: var(--code-line-height) !important;
+    font-size: var(--code-font-size) !important;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important;
+    border: none;
+}
+
+.line-numbers {
+    line-height: var(--code-line-height) !important;
+}
+
+.line-numbers .line-number {
+    line-height: var(--code-line-height) !important;
+}
+
+/* Custom CSS from frontmatter */
+/* NOTE(review): the `config.custom_css` placeholder was emitted unrendered here
+   (an invalid block that browsers ignore) and has been removed. */
+
+/* NOTE(review): the frontmatter code-font-size override was emitted with its
+   template tags unrendered. The two resulting `:root` rules assigned the
+   literal placeholder text to `--code-font-size`, which makes every
+   `font-size: var(--code-font-size)` declaration invalid at computed-value
+   time. They are removed so that an earlier definition of the variable applies
+   again - presumably one exists higher up in this stylesheet; confirm, and
+   regenerate from a fixed template to apply a frontmatter value. */
+
+/* Cursor for tools */
+body[data-tool="arrow"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><path d="M2 2l7.586 7.586"/><circle cx="11" cy="11" r="2"/></svg>') 12 12, crosshair;
+}
+
+body[data-tool="pen"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><circle cx="4" cy="20" r="2" fill="%23e53935"/></svg>') 4 20, pointer;
+}
+
+body[data-tool="eraser"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M20 20H7l-7-7 7-7h13v14z"/><path d="M13 13l7-7"/><path d="M13 13L9 9"/></svg>') 12 12, auto;
+}
+
+/* Color picker styles */
+.tools-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin: 0.75rem 0 0.5rem 0;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.color-row {
+    display: grid;
+    grid-template-columns: repeat(6, 1fr);
+    gap: 0.25rem;
+    margin-bottom: 0.5rem;
+}
+
+.color-swatch {
+    width: 18px;
+    height: 18px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    position: relative;
+}
+
+.color-swatch:hover {
+    transform: scale(1.1);
+    border-color: var(--text-secondary);
+}
+
+.color-swatch.selected {
+    border-color: var(--text-primary);
+    box-shadow: 0 0 0 2px var(--text-link);
+}
+
+.color-swatch.selected::after {
+    content: '✓';
+    position: absolute;
+    top: 50%;
+    left: 50%;
+    transform: translate(-50%, -50%);
+    color: white;
+    font-size: 10px;
+    font-weight: bold;
+    text-shadow: 1px 1px 1px black;
+}
+
+.color-input {
+    width: 24px;
+    height: 24px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    background: none;
+    padding: 0;
+    grid-column: span 2;
+    justify-self: center;
+}
+
+.color-input:hover {
+    border-color: var(--text-secondary);
+}
+
+/* Thickness slider styles */
+.thickness-row {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    margin-top: 0.75rem;
+}
+
+.thickness-slider {
+    flex: 1;
+    -webkit-appearance: none;
+    appearance: none;
+    height: 4px;
+    background: var(--border-primary);
+    border-radius: 2px;
+    outline: none;
+    opacity: 0.7;
+    transition: opacity 0.2s;
+}
+
+.thickness-slider:hover {
+    opacity: 1;
+}
+
+.thickness-slider::-webkit-slider-thumb {
+    -webkit-appearance: none;
+    appearance: none;
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+}
+
+.thickness-slider::-moz-range-thumb {
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+    border: none;
+}
+
+.thickness-value {
+    font-size: 0.7rem;
+    color: var(--text-secondary);
+    min-width: 20px;
+    text-align: right;
+}
+
+.highlight {
+    background: none !important;
+}
+
+/* Loading animations */
+.loading-spinner {
+    display: inline-block;
+    width: 16px;
+    height: 16px;
+    border: 2px solid var(--border-primary);
+    border-radius: 50%;
+    border-top-color: var(--text-link);
+    animation: spin 1s linear infinite;
+    margin-right: 8px;
+    vertical-align: middle;
+}
+
+@keyframes spin {
+    to {
+        transform: rotate(360deg);
+    }
+}
+
+.loading-skeleton {
+    display: inline-block;
+    background: var(--bg-tertiary);
+    background: linear-gradient(90deg,
+            var(--bg-tertiary) 25%,
+            var(--bg-secondary) 50%,
+            var(--bg-tertiary) 75%);
+    background-size: 200% 100%;
+    animation: loading-shimmer 2s ease-in-out infinite;
+    border-radius: 2px;
+    height: 1em;
+    width: 80px;
+    vertical-align: middle;
+}
+
+@keyframes loading-shimmer {
+    0% {
+        background-position: -200% 0;
+    }
+
+    100% {
+        background-position: 200% 0;
+    }
+}
+
+/* Loading state for cell output */
+.cell-output:has(.loading-spinner) {
+    opacity: 0.7;
+    background: var(--bg-secondary);
+    /* border-left: 3px solid var(--text-link); */
+}
+    </style>
+    <script>
+// --- Drag utilities ---
+        function clamp(val, min, max) { return Math.max(min, Math.min(max, val)); }
+
+        function restorePosition(el, storageKey) {
+            try {
+                const raw = localStorage.getItem(storageKey);
+                if (!raw) return;
+                const pos = JSON.parse(raw);
+                if (typeof pos.left === 'number' && typeof pos.top === 'number') {
+                    el.style.left = pos.left + 'px';
+                    el.style.top = pos.top + 'px';
+                    el.style.right = 'auto';
+                    el.style.bottom = 'auto';
+                }
+            } catch (_) {}
+        }
+
+        function savePosition(el, storageKey) {
+            try {
+                const left = parseFloat(el.style.left || 'NaN');
+                const top = parseFloat(el.style.top || 'NaN');
+                if (!Number.isNaN(left) && !Number.isNaN(top)) {
+                    localStorage.setItem(storageKey, JSON.stringify({ left, top }));
+                }
+            } catch (_) {}
+        }
+
+
+        function makeDraggable(el, storageKey, handleEl) {
+            let dragging = false;
+            let startX = 0, startY = 0; // cursor
+            let origLeft = 0, origTop = 0; // element
+
+            const onMove = (e) => {
+                if (!dragging) return;
+                const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+                const clientY = e.touches ? e.touches[0].clientY : e.clientY;
+                const dx = clientX - startX;
+                const dy = clientY - startY;
+                const w = el.offsetWidth;
+                const h = el.offsetHeight;
+                const maxX = window.innerWidth - w;
+                const maxY = window.innerHeight - h;
+                const newLeft = clamp(origLeft + dx, 0, maxX);
+                const newTop = clamp(origTop + dy, 0, maxY);
+                el.style.left = newLeft + 'px';
+                el.style.top = newTop + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+            };
+
+            const endDrag = () => {
+                if (!dragging) return;
+                dragging = false;
+                document.removeEventListener('mousemove', onMove);
+                document.removeEventListener('mouseup', endDrag);
+                document.removeEventListener('touchmove', onMove);
+                document.removeEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grab');
+                savePosition(el, storageKey);
+                // ensure no-overlap constraint after a drag
+                try { layoutWidgetsStackedBottomRight(); } catch (_) {}
+            };
+
+            const startDrag = (e) => {
+                // Start from element's current on-screen rect
+                const elRect = el.getBoundingClientRect();
+                el.style.left = elRect.left + 'px';
+                el.style.top = elRect.top + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+
+                dragging = true;
+                startX = e.touches ? e.touches[0].clientX : e.clientX;
+                startY = e.touches ? e.touches[0].clientY : e.clientY;
+                origLeft = elRect.left;
+                origTop = elRect.top;
+
+                document.addEventListener('mousemove', onMove);
+                document.addEventListener('mouseup', endDrag);
+                document.addEventListener('touchmove', onMove, { passive: false });
+                document.addEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grabbing');
+                e.preventDefault();
+            };
+
+            (handleEl || el).addEventListener('mousedown', startDrag);
+            (handleEl || el).addEventListener('touchstart', startDrag, { passive: false });
+
+            // Apply any saved position on init
+            restorePosition(el, storageKey);
+        }
+        function toggleCell(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+            }
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+            }
+            
+            updateIndicators(cellId);
+            encodeToolStateToUrl();
+        }
+        
+        function toggleCode(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+        
+        function toggleOutput(cellId) {
+            const outputElement = document.getElementById('output-' + cellId);
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+
+        function toggleUvLogs(headerElement) {
+            const contentElement = headerElement.nextElementSibling;
+            if (contentElement) {
+                const isCollapsed = contentElement.style.display === 'none';
+                contentElement.style.display = isCollapsed ? 'block' : 'none';
+                headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+
+                // Update the header indicator if it exists
+                const uvLogsDiv = headerElement.parentElement;
+                if (uvLogsDiv && uvLogsDiv.id && uvLogsDiv.id.startsWith('uv-logs-')) {
+                    const cellId = uvLogsDiv.id.replace('uv-logs-', '');
+                    const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+
+        function toggleUvLogsFromHeader(cellId) {
+            const uvLogsElement = document.getElementById('uv-logs-' + cellId);
+            const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+            if (uvLogsElement) {
+                const headerElement = uvLogsElement.querySelector('.uv-logs-header');
+                const contentElement = uvLogsElement.querySelector('.uv-logs-content');
+                if (contentElement && headerElement) {
+                    const isCollapsed = contentElement.style.display === 'none';
+                    contentElement.style.display = isCollapsed ? 'block' : 'none';
+                    headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+        
+        function updateIndicators(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            const indicators = document.querySelector(`[onclick*="${cellId}"]`)?.closest('.cell-header')?.querySelector('.collapse-indicators');
+            
+            if (indicators) {
+                const codeCollapsed = codeElement && codeElement.classList.contains('collapsed');
+                const outputCollapsed = outputElement && outputElement.classList.contains('collapsed');
+                
+                const codeIcon = codeCollapsed ? '▶' : '▼';
+                const outputIcon = outputCollapsed ? '▶' : '▼';
+                
+                const codeSpan = indicators.querySelector('[onclick*="toggleCode"]');
+                const outputSpan = indicators.querySelector('[onclick*="toggleOutput"]');
+                
+                if (codeSpan) codeSpan.innerHTML = `${codeIcon} code`;
+                if (outputSpan) outputSpan.innerHTML = `${outputIcon} output`;
+            }
+        }
+        
+        function toggleTheme() {
+            const html = document.documentElement;
+            const currentTheme = html.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            html.setAttribute('data-theme', newTheme);
+            localStorage.setItem('uvnote-theme', newTheme);
+            updateThemeIcon();
+            updateUiDebug();
+        }
+        
+        // Two panel code removed
+        
+        function updateThemeIcon() {
+            const theme = document.documentElement.getAttribute('data-theme');
+            const toggle = document.querySelector('.theme-toggle');
+            if (toggle) {
+                toggle.textContent = theme === 'dark' ? 'light' : 'dark';
+            }
+        }
+        function setUiTheme(newUi) {
+            if (newUi !== 'default' && newUi !== 'none' && newUi !== 'monocolor') return;
+            const html = document.documentElement;
+            html.setAttribute('data-ui', newUi);
+            try { localStorage.setItem('uvnote-ui', newUi); } catch (_) {}
+            updateUiMenu();
+            updateUiDebug();
+        }
+        function updateUiMenu() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const checks = {
+                default: document.getElementById('checkbox-ui-default'),
+                none: document.getElementById('checkbox-ui-none'),
+                monocolor: document.getElementById('checkbox-ui-monocolor')
+            };
+            if (checks.default) checks.default.textContent = ui === 'default' ? '☑' : '☐';
+            if (checks.none) checks.none.textContent = ui === 'none' ? '☑' : '☐';
+            if (checks.monocolor) checks.monocolor.textContent = ui === 'monocolor' ? '☑' : '☐';
+        }
+
+        function updateUiDebug() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const color = document.documentElement.getAttribute('data-theme') || 'light';
+            const el = document.getElementById('ui-debug');
+            if (el) {
+                el.textContent = `UI: ${ui} | Color: ${color}`;
+            }
+        }
+
+        // Line selection and deep-linking
+        function clearLineSelections() {
+            try {
+                document.querySelectorAll('.code-line-highlight').forEach(el => { el.style.display = 'none'; });
+                document.querySelectorAll('.line-number.selected').forEach(el => el.classList.remove('selected'));
+            } catch (_) {}
+        }
+
+        // Currently highlighted line range, or null when nothing is selected.
+        // Assigned by selectCellLine/selectCellLines and reset by clearSelection.
+        let _selection = null; // { cellId, a, b }
+
+        function clearSelection(updateUrl) {
+            clearLineSelections();
+            _selection = null;
+            if (updateUrl) {
+                try {
+                    const url = new URL(window.location.href);
+                    url.searchParams.delete('cell');
+                    url.searchParams.delete('line');
+                    history.replaceState(null, '', url.toString());
+                } catch (_) {}
+            }
+            updateStateIndicator();
+        }
+
+        function selectCellLine(cellId, line, updateUrl) {
+            try {
+                // Ensure only one selection across the whole document
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                // Measure line height directly from computed style
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                // Position overlay
+                overlay.style.display = 'block';
+                overlay.style.height = `${lh}px`;
+                overlay.style.top = `${pre.offsetTop + padTop + (line - 1) * lh}px`;
+
+                // Update selected class in line numbers
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    const sel = numbers.querySelector(`.line-number[data-line="${line}"]`);
+                    if (sel) sel.classList.add('selected');
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    url.searchParams.set('line', String(line));
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a: line, b: line };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLine error', e); }
+        }
+
+        function selectCellLines(cellId, startLine, endLine, updateUrl) {
+            try {
+                // normalize order
+                const a = Math.min(startLine, endLine);
+                const b = Math.max(startLine, endLine);
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                overlay.style.display = 'block';
+                overlay.style.top = `${pre.offsetTop + padTop + (a - 1) * lh}px`;
+                overlay.style.height = `${(b - a + 1) * lh}px`;
+
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    for (let i = a; i <= b; i++) {
+                        const el = numbers.querySelector(`.line-number[data-line="${i}"]`);
+                        if (el) el.classList.add('selected');
+                    }
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    if (a === b) url.searchParams.set('line', String(a));
+                    else url.searchParams.set('line', `${a}-${b}`);
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a, b };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLines error', e); }
+        }
+
+        // Drag-to-select support on line numbers
+        let _lineDrag = { active: false, cellId: null, start: 0 };
+        function onLineNumberMouseDown(e) {
+            const a = e.target.closest('.line-number');
+            if (!a) return;
+            e.preventDefault();
+            const cellId = a.dataset.cell;
+            const line = parseInt(a.dataset.line || '1', 10) || 1;
+            // Toggle off if this exact single line is already the only selection
+            const numbers = document.getElementById(`lines-${cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length === 1 && selected[0] === line) {
+                    clearSelection(true);
+                    return;
+                }
+            }
+            _lineDrag.active = true;
+            _lineDrag.cellId = cellId;
+            _lineDrag.start = line;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, _lineDrag.start, false);
+        }
+        function onDocMouseMove(e) {
+            if (!_lineDrag.active) return;
+            const el = document.elementFromPoint(e.clientX, e.clientY);
+            if (!el) return;
+            const a = el.closest && el.closest('.line-number');
+            if (!a) return;
+            if (a.dataset.cell !== _lineDrag.cellId) return;
+            const cur = parseInt(a.dataset.line || '1', 10) || 1;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, cur, false);
+        }
+        function onDocMouseUp(e) {
+            if (!_lineDrag.active) return;
+            const last = document.querySelector('.line-number.selected:last-of-type');
+            // finalize URL using the current selected range
+            const numbers = document.getElementById(`lines-${_lineDrag.cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length) {
+                    const a = Math.min(...selected); const b = Math.max(...selected);
+                    selectCellLines(_lineDrag.cellId, a, b, true);
+                }
+            }
+            _lineDrag.active = false; _lineDrag.cellId = null; _lineDrag.start = 0;
+        }
+
+        function applyLocationFromUrl() {
+            try {
+                const url = new URL(window.location.href);
+                const cell = url.searchParams.get('cell');
+                const lineParam = url.searchParams.get('line');
+                if (cell && lineParam) {
+                    if (lineParam.includes('-')) {
+                        const [a, b] = lineParam.split('-').map(x => parseInt(x, 10));
+                        if (!Number.isNaN(a) && !Number.isNaN(b)) selectCellLines(cell, a, b, false);
+                    } else {
+                        const l = parseInt(lineParam, 10);
+                        if (!Number.isNaN(l)) selectCellLine(cell, l, false);
+                    }
+                }
+                
+                // Apply tool parameters from URL
+                applyToolsFromUrl(url.searchParams);
+                
+                // Cell states will be applied later in DOMContentLoaded with proper timing
+                const encodedCellStates = url.searchParams.get('cells');
+                console.log('Encoded cell states from URL:', encodedCellStates);
+            } catch (_) {}
+        }
+        
+        function applyToolsFromUrl(params) {
+            try {
+                // Check if tools widget should be shown
+                const showTools = params.get('tools');
+                if (showTools === '1') {
+                    // Mark that tool was loaded from URL
+                    _urlLoadedTool = true;
+                    
+                    // Apply color
+                    const color = params.get('color');
+                    if (color && /^[0-9a-fA-F]{6}$/.test(color)) {
+                        setStoredArrowColor('#' + color);
+                    }
+                    
+                    // Apply thickness
+                    const thickness = params.get('thickness');
+                    if (thickness) {
+                        const value = parseInt(thickness, 10);
+                        if (value >= 1 && value <= 10) {
+                            setStoredLineThickness(value);
+                        }
+                    }
+                    
+                    // Don't override fadeout time for URL-loaded tools - let individual shapes decide
+                    
+                    // Load shapes from URL
+                    const encodedShapes = params.get('shapes');
+                    if (encodedShapes) {
+                        const decodedShapes = decodeShapesFromUrl(encodedShapes);
+                        if (decodedShapes.length > 0) {
+                            _shapes = decodedShapes;
+                            saveShapes();
+                            // Trigger render after overlay is initialized
+                            setTimeout(() => {
+                                renderOverlay();
+                            }, 300);
+                        }
+                    }
+                    
+                    // Wait for widgets to be initialized before showing tools
+                    setTimeout(() => {
+                        const toolsWidget = document.querySelector('.tools-widget');
+                        const checkbox = document.getElementById('checkbox-tools');
+                        if (toolsWidget && checkbox) {
+                            toolsWidget.style.display = 'block';
+                            checkbox.textContent = '☑';
+                            localStorage.setItem('uvnote-widget-tools', 'visible');
+                        }
+                        
+                        // Apply active tool
+                        const activeTool = params.get('tool');
+                        if (activeTool && ['arrow', 'pen', 'eraser', 'spotlight'].includes(activeTool)) {
+                            const toolBtn = Array.from(document.querySelectorAll('.tool-button')).find(btn => btn.textContent === activeTool);
+                            if (toolBtn) {
+                                toolBtn.click();
+                            }
+                        }
+                        
+                        // Re-layout widgets after showing tools
+                        layoutWidgetsStackedBottomRight();
+                    }, 200);
+                }
+            } catch (_) {}
+        }
+        
+        function captureInitialCellStates() {
+            const cells = document.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    if (codeEl) {
+                        state.c = codeEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    if (outputEl) {
+                        state.o = outputEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    _initialCellStates[cellId] = state;
+                }
+            });
+            console.log('Captured initial cell states:', _initialCellStates);
+        }
+
+        function encodeCellStatesToUrl() {
+            // Get all cells and their collapse states
+            const cells = document.querySelectorAll('.cell');
+            const cellStates = {};
+            
+            console.log('Found cells:', cells.length);
+            
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                const initialState = _initialCellStates[cellId] || {};
+                
+                console.log(`Encoding cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    codeCollapsed: codeEl ? codeEl.classList.contains('collapsed') : 'N/A',
+                    outputCollapsed: outputEl ? outputEl.classList.contains('collapsed') : 'N/A',
+                    initialState: initialState
+                });
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    let hasChanges = false;
+                    
+                    if (codeEl) {
+                        const currentCodeState = codeEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialCodeState = initialState.c;
+                        // Only encode if different from initial state
+                        if (initialCodeState !== undefined && currentCodeState !== initialCodeState) {
+                            state.c = currentCodeState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    if (outputEl) {
+                        const currentOutputState = outputEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialOutputState = initialState.o;
+                        // Only encode if different from initial state
+                        if (initialOutputState !== undefined && currentOutputState !== initialOutputState) {
+                            state.o = currentOutputState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    // Only include cell if it has changes from initial state
+                    if (hasChanges) {
+                        cellStates[cellId] = state;
+                        console.log(`Added cell ${cellId}:`, state);
+                    }
+                }
+            });
+            
+            console.log('Final cell states to encode:', cellStates);
+            
+            // Return empty string if no changed cells
+            if (Object.keys(cellStates).length === 0) return '';
+            
+            // Encode as compact base64 string
+            const encoded = btoa(JSON.stringify(cellStates));
+            console.log('Encoded cell states:', encoded);
+            return encoded;
+        }
+        
+        function decodeCellStatesFromUrl(encodedStates) {
+            if (!encodedStates) return {};
+            
+            try {
+                return JSON.parse(atob(encodedStates));
+            } catch (e) {
+                console.error('Failed to decode cell states:', e);
+                return {};
+            }
+        }
+        
+        // Apply collapsed/expanded state to cells from a decoded `cells` URL payload.
+        // `cellStates` maps a cell id to an object with optional flags `c` (code) and
+        // `o` (output): 0 collapses the element, any other defined value expands it.
+        // Elements that do not exist in the DOM are skipped. The console.log calls
+        // are diagnostics only.
+        function applyCellStatesFromUrl(cellStates) {
+            console.log('Applying cell states from URL:', cellStates);
+            Object.entries(cellStates).forEach(([cellId, state]) => {
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                console.log(`Cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    state: state
+                });
+                
+                // Code pane: toggle the `collapsed` class according to state.c
+                if (codeEl && state.c !== undefined) {
+                    if (state.c === 0) {
+                        // NOTE(review): an existing `expanded` class is not removed here,
+                        // although the expand branch below adds one - confirm the CSS copes.
+                        codeEl.classList.add('collapsed');
+                        console.log(`Collapsed code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    } else {
+                        codeEl.classList.remove('collapsed');
+                        codeEl.classList.add('expanded'); // Explicitly add expanded class
+                        console.log(`Expanded code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            hasExpandedClass: codeEl.classList.contains('expanded'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    }
+                }
+                
+                // Output pane: only the `collapsed` class is toggled (no `expanded` class)
+                if (outputEl && state.o !== undefined) {
+                    if (state.o === 0) {
+                        outputEl.classList.add('collapsed');
+                        console.log(`Collapsed output for cell ${cellId}`);
+                    } else {
+                        outputEl.classList.remove('collapsed');
+                        console.log(`Expanded output for cell ${cellId}`);
+                    }
+                }
+                
+                // Update visual indicators and force style recalculation
+                // A failure for one cell is logged and does not stop the remaining cells.
+                try {
+                    updateIndicators(cellId);
+                    // Force browser to recalculate styles
+                    if (codeEl) {
+                        codeEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - code visible: ${getComputedStyle(codeEl).display !== 'none'}`);
+                    }
+                    if (outputEl) {
+                        outputEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - output visible: ${getComputedStyle(outputEl).display !== 'none'}`);
+                    }
+                } catch (e) {
+                    console.error(`Error updating indicators for cell ${cellId}:`, e);
+                }
+            });
+        }
+
+        function encodeShapesToUrl() {
+            // Encode shapes as compact base64 string
+            if (_shapes.length === 0) return '';
+            
+            const shapeData = _shapes.map(shape => {
+                const baseData = {
+                    ct: shape.createdAt, // creation timestamp
+                    fo: shape.fadeoutTime || getFadeoutTime() // fadeout time for this shape
+                };
+                
+                if (shape.type === 'arrow') {
+                    return {
+                        ...baseData,
+                        t: 'a',
+                        x1: Math.round(shape.x1),
+                        y1: Math.round(shape.y1),
+                        x2: Math.round(shape.x2),
+                        y2: Math.round(shape.y2),
+                        c: shape.color.substring(1), // remove #
+                        w: shape.width
+                    };
+                } else if (shape.type === 'pen') {
+                    return {
+                        ...baseData,
+                        t: 'p',
+                        pts: shape.points.map(p => [Math.round(p.x), Math.round(p.y)]),
+                        c: shape.color.substring(1),
+                        w: shape.width
+                    };
+                } else if (shape.type === 'spotlight') {
+                    return {
+                        ...baseData,
+                        t: 's',
+                        x: Math.round(shape.x),
+                        y: Math.round(shape.y),
+                        r: Math.round(shape.radius)
+                    };
+                }
+            }).filter(Boolean);
+            
+            return btoa(JSON.stringify(shapeData));
+        }
+        
+        function decodeShapesFromUrl(encodedShapes) {
+            if (!encodedShapes) return [];
+            
+            try {
+                const shapeData = JSON.parse(atob(encodedShapes));
+                return shapeData.map(data => {
+                    const base = {
+                        createdAt: data.ct || Date.now(), // use encoded timestamp or current time
+                        fadeoutTime: data.fo || 0, // use encoded fadeout time or 0 (never fade)
+                        opacity: 1.0
+                    };
+                    
+                    if (data.t === 'a') {
+                        return {
+                            ...base,
+                            type: 'arrow',
+                            x1: data.x1,
+                            y1: data.y1,
+                            x2: data.x2,
+                            y2: data.y2,
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 'p') {
+                        return {
+                            ...base,
+                            type: 'pen',
+                            points: data.pts.map(([x, y]) => ({ x, y })),
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 's') {
+                        return {
+                            ...base,
+                            type: 'spotlight',
+                            x: data.x,
+                            y: data.y,
+                            radius: data.r,
+                            color: '#000000'
+                        };
+                    }
+                }).filter(Boolean);
+            } catch (e) {
+                console.error('Failed to decode shapes:', e);
+                return [];
+            }
+        }
+        
+        function encodeToolStateToUrl() {
+            // Don't update URL during initialization
+            if (_isInitializing) {
+                return window.location.href;
+            }
+            
+            const params = new URLSearchParams(window.location.search);
+            
+            // Check if tools widget is visible and has an active tool
+            const toolsWidget = document.querySelector('.tools-widget');
+            const activeTool = document.body.dataset.tool;
+            const hasActiveTool = activeTool && activeTool !== 'none';
+            const toolsWidgetVisible = toolsWidget && getComputedStyle(toolsWidget).display !== 'none';
+            
+            // Always handle shapes regardless of tool state
+            const encodedShapes = encodeShapesToUrl();
+            if (encodedShapes) {
+                params.set('shapes', encodedShapes);
+            } else {
+                params.delete('shapes');
+            }
+            
+            // Always preserve existing cell states from URL if present
+            const existingCellStates = params.get('cells');
+            if (existingCellStates) {
+                // Keep existing cell states - don't re-encode from DOM
+                params.set('cells', existingCellStates);
+            } else {
+                // Only encode new cell states if none exist in URL
+                const encodedCellStates = encodeCellStatesToUrl();
+                if (encodedCellStates) {
+                    params.set('cells', encodedCellStates);
+                }
+            }
+            
+            if (toolsWidgetVisible && hasActiveTool) {
+                // Include tool params when widget is visible AND tool is active
+                params.set('tools', '1');
+                params.set('tool', activeTool);
+                
+                // Get color (without # prefix)
+                const color = getArrowColor();
+                if (color && color.startsWith('#')) {
+                    params.set('color', color.substring(1));
+                }
+                
+                // Get thickness
+                const thickness = getLineThickness();
+                params.set('thickness', thickness.toString());
+            } else {
+                // Remove tool state params but keep shapes
+                params.delete('tools');
+                params.delete('tool');
+                params.delete('color');
+                params.delete('thickness');
+                params.delete('fadeout');
+            }
+            
+            // Update URL without reloading
+            const newUrl = window.location.pathname + (params.toString() ? '?' + params.toString() : '') + window.location.hash;
+            window.history.replaceState(null, '', newUrl);
+            
+            return window.location.href;
+        }
+
+        function resetLayout() {
+            try {
+                // Clear all uvnote-* keys
+                const allKeys = Object.keys(localStorage);
+                const uvnoteKeys = allKeys.filter(key => key.startsWith('uvnote-'));
+                uvnoteKeys.forEach(k => localStorage.removeItem(k));
+            } catch (_) {}
+            
+            // Clear any active selection and remove URL params
+            try { clearSelection(true); } catch(_) {}
+            // Reset active tool if any
+            try { window.setActiveTool('none'); } catch(_) {}
+            // Clear shapes
+            try { _shapes = []; saveShapes(); } catch(_) {}
+            // Reset URL-loaded tool flag
+            try { _urlLoadedTool = false; } catch(_) {}
+            // Reset all cells to expanded state
+            try {
+                const cells = document.querySelectorAll('.cell');
+                cells.forEach(cell => {
+                    const cellId = cell.id.replace('cell-', '');
+                    const codeEl = document.getElementById('code-' + cellId);
+                    const outputEl = document.getElementById('output-' + cellId);
+                    if (codeEl) codeEl.classList.remove('collapsed');
+                    if (outputEl) outputEl.classList.remove('collapsed');
+                    updateIndicators(cellId);
+                });
+            } catch(_) {}
+            
+            // Clear ALL URL parameters and reload with clean URL
+            try {
+                const cleanUrl = window.location.pathname + window.location.hash;
+                window.location.href = cleanUrl; // Use window.location.href instead of history.replaceState + reload
+            } catch (_) {
+                // Fallback - reload current page
+                location.reload();
+            }
+        }
+
+        function toggleMenu() {
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.toggle('active');
+            }
+        }
+
+        function toggleWidget(widgetName) {
+            let widget;
+            let checkbox;
+            
+            // Close the menu first
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.remove('active');
+            }
+            
+            switch(widgetName) {
+                case 'tools':
+                    widget = document.querySelector('.tools-widget');
+                    checkbox = document.getElementById('checkbox-tools');
+                    break;
+                case 'file-explorer':
+                    widget = document.querySelector('.file-explorer');
+                    checkbox = document.getElementById('checkbox-file-explorer');
+                    break;
+                case 'minimap':
+                    widget = document.querySelector('.minimap');
+                    checkbox = document.getElementById('checkbox-minimap');
+                    break;
+                case 'status':
+                    widget = document.querySelector('.status-widget');
+                    checkbox = document.getElementById('checkbox-status');
+                    break;
+                default:
+                    return;
+            }
+            
+            if (widget && checkbox) {
+                const isVisible = getComputedStyle(widget).display !== 'none';
+                widget.style.display = isVisible ? 'none' : 'block';
+                checkbox.textContent = isVisible ? '☐' : '☑';
+                
+                // Save state to localStorage
+                try {
+                    localStorage.setItem(`uvnote-widget-${widgetName}`, isVisible ? 'hidden' : 'visible');
+                } catch (_) {}
+                
+                // Re-layout widgets after visibility change
+                try { 
+                    layoutWidgetsStackedBottomRight(); 
+                } catch (_) {}
+                
+                // Update URL when tools widget visibility changes
+                if (widgetName === 'tools') {
+                    encodeToolStateToUrl();
+                }
+            }
+        }
+        
+        function initializeWidgetVisibility() {
+            const widgets = [
+                { name: 'tools', selector: '.tools-widget' },
+                { name: 'file-explorer', selector: '.file-explorer' },
+                { name: 'minimap', selector: '.minimap' },
+                { name: 'status', selector: '.status-widget' }
+            ];
+            
+            widgets.forEach(({ name, selector }) => {
+                const defaultState = name === 'status' ? 'visible' : 'hidden';
+                const savedState = localStorage.getItem(`uvnote-widget-${name}`) || defaultState;
+                const widget = document.querySelector(selector);
+                const checkbox = document.getElementById(`checkbox-${name}`);
+                
+                if (widget && checkbox) {
+                    const isVisible = savedState === 'visible';
+                    widget.style.display = isVisible ? 'block' : 'none';
+                    checkbox.textContent = isVisible ? '☑' : '☐';
+                }
+            });
+        }
+
+        // Close menu when clicking outside
+        document.addEventListener('click', function(event) {
+            const menuButton = document.querySelector('.menu-button');
+            // Don't close if clicking on a menu item (let the item handler close it)
+            if (menuButton && !menuButton.contains(event.target)) {
+                menuButton.classList.remove('active');
+            }
+        });
+
+        // Layout: stack widgets bottom-right and equalize widths
+        function hasCustomWidgetPositions() {
+            try {
+                return (
+                    localStorage.getItem('uvnote-minimap-pos') ||
+                    localStorage.getItem('uvnote-file-explorer-pos') ||
+                    localStorage.getItem('uvnote-tools-pos')
+                );
+            } catch (_) { return false; }
+        }
+
+        function rectsOverlap(r1, r2) {
+            return !(r1.right <= r2.left || r2.right <= r1.left || r1.bottom <= r2.top || r2.bottom <= r1.top);
+        }
+
+        function widgetsOverlap(widgets) {
+            for (let i = 0; i < widgets.length; i++) {
+                const a = widgets[i];
+                const ra = a.getBoundingClientRect();
+                for (let j = i + 1; j < widgets.length; j++) {
+                    const b = widgets[j];
+                    const rb = b.getBoundingClientRect();
+                    if (rectsOverlap(ra, rb)) return true;
+                }
+            }
+            return false;
+        }
+
+        function applyStackLayout(widgets, order) {
+            if (!widgets.length) return;
+            // Fixed equal width
+            const fixedWidth = 220;
+            widgets.forEach(el => { el.style.width = fixedWidth + 'px'; });
+
+            // Fit heights if needed to avoid overflow
+            const gap = 12;
+            const available = Math.max(0, window.innerHeight - 40 - gap * (order.length - 1));
+            const eachMax = Math.floor(available / order.length);
+            order.forEach(el => {
+                el.style.maxHeight = eachMax + 'px';
+                el.style.overflowY = 'auto';
+            });
+
+            // Stack bottom-up in the requested order
+            let bottomOffset = 20; // base gutter
+            order.forEach(el => {
+                el.style.left = 'auto';
+                el.style.top = 'auto';
+                el.style.right = '20px';
+                el.style.bottom = bottomOffset + 'px';
+                bottomOffset += el.offsetHeight + gap;
+            });
+        }
+
+        function layoutWidgetsStackedBottomRight() {
+            const minimap = document.querySelector('.minimap');
+            const fileExplorer = document.querySelector('.file-explorer');
+            const tools = document.querySelector('.tools-widget');
+            const status = document.querySelector('.status-widget');
+            const widgets = [minimap, fileExplorer, tools, status].filter(el => el && getComputedStyle(el).display !== 'none');
+            if (!widgets.length) return;
+
+            const order = [minimap, fileExplorer, tools, status].filter(Boolean).filter(el => getComputedStyle(el).display !== 'none');
+
+            // If user placed custom positions and there is no overlap, respect them.
+            if (hasCustomWidgetPositions() && !widgetsOverlap(widgets)) return;
+
+            applyStackLayout(widgets, order);
+        }
+        
+        // Panel icon removed
+        
+        // Scroll source the minimap listens to, and the 'scroll' listener registered on
+        // it. Both are assigned in initMinimap() and reset to null in teardownMinimap().
+        let _minimapScrollContainer = null;
+        let _minimapScrollHandler = null;
+        function initMinimap() {
+            // Generate minimap content
+            const minimap = createMinimap();
+            document.body.appendChild(minimap);
+            // Make draggable (use title as handle)
+            const mTitle = minimap.querySelector('.minimap-title');
+            makeDraggable(minimap, 'uvnote-minimap-pos', mTitle);
+
+            // Attach scroll listener to window (two-panel removed)
+            _minimapScrollContainer = window;
+
+            if (_minimapScrollContainer) {
+                _minimapScrollHandler = () => updateMinimapActive();
+                if (_minimapScrollContainer === window) {
+                    window.addEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.addEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            updateMinimapActive();
+        }
+
+        function teardownMinimap() {
+            const minimap = document.querySelector('.minimap');
+            if (minimap && minimap.parentNode) minimap.parentNode.removeChild(minimap);
+            if (_minimapScrollContainer && _minimapScrollHandler) {
+                if (_minimapScrollContainer === window) {
+                    window.removeEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.removeEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            _minimapScrollContainer = null;
+            _minimapScrollHandler = null;
+        }
+        
+        function initFileExplorer() {
+            // Generate file explorer content
+            const fileExplorer = createFileExplorer();
+            document.body.appendChild(fileExplorer);
+        }
+        
+        function createMinimap() {
+            const minimap = document.createElement('div');
+            minimap.className = 'minimap';
+            
+            const title = document.createElement('div');
+            title.className = 'minimap-title';
+            title.textContent = 'navigation';
+            minimap.appendChild(title);
+            
+            // Find all headings and cells
+            const root = document.querySelector('.main-content') || document;
+            const headings = root.querySelectorAll('h1, h2, h3, h4, h5, h6');
+            const cells = root.querySelectorAll('.cell');
+            
+            // Combine and sort by position
+            const items = [];
+            
+            headings.forEach(heading => {
+                const id = heading.id || generateId(heading.textContent);
+                if (!heading.id) heading.id = id;
+                
+                items.push({
+                    element: heading,
+                    type: 'heading',
+                    level: parseInt(heading.tagName.charAt(1)),
+                    text: heading.textContent.trim(),
+                    id: id,
+                    position: heading.getBoundingClientRect().top + window.scrollY
+                });
+            });
+            
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const id = cell.id || `cell-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+                    if (!cell.id) cell.id = id;
+                    
+                    items.push({
+                        element: cell,
+                        type: 'cell',
+                        text: header.textContent.trim(),
+                        id: id,
+                        position: cell.getBoundingClientRect().top + window.scrollY
+                    });
+                }
+            });
+            
+            // Sort by position
+            items.sort((a, b) => a.position - b.position);
+            
+            // Create minimap items
+            items.forEach(item => {
+                const link = document.createElement('a');
+                link.className = `minimap-item ${item.type === 'heading' ? 'minimap-heading' : 'minimap-cell'}`;
+                if (item.type === 'heading') {
+                    link.classList.add(`h${item.level}`);
+                }
+                link.textContent = item.text.length > 25 ? item.text.substring(0, 22) + '...' : item.text;
+                link.href = `#${item.id}`;
+                link.onclick = function(e) {
+                    e.preventDefault();
+                    item.element.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                };
+                minimap.appendChild(link);
+            });
+            
+            return minimap;
+        }
+        
+        function generateId(text) {
+            return text.toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '')
+                .substring(0, 20);
+        }
+        
+        function updateMinimapActive() {
+            const minimapItems = document.querySelectorAll('.minimap-item');
+            const container = _minimapScrollContainer || window;
+            const containerRect = container === window ? null : container.getBoundingClientRect();
+            const scrollPos = (container === window ? window.scrollY : container.scrollTop) + 100; // Offset for better detection
+            
+            let activeItem = null;
+            minimapItems.forEach(item => {
+                const targetId = item.getAttribute('href').substring(1);
+                const target = document.getElementById(targetId);
+                
+                if (target) {
+                    const rectTop = target.getBoundingClientRect().top;
+                    const targetPos = (container === window)
+                        ? rectTop + window.scrollY
+                        : rectTop - containerRect.top + container.scrollTop;
+                    if (targetPos <= scrollPos) {
+                        activeItem = item;
+                    }
+                }
+                
+                item.classList.remove('active');
+            });
+            
+            if (activeItem) {
+                activeItem.classList.add('active');
+            }
+        }
+        
+        function createFileExplorer() {
+            const fileExplorer = document.createElement('div');
+            fileExplorer.className = 'file-explorer';
+            
+            const title = document.createElement('div');
+            title.className = 'file-explorer-title';
+            title.textContent = 'files';
+            fileExplorer.appendChild(title);
+            // Make draggable (use title as handle)
+            makeDraggable(fileExplorer, 'uvnote-file-explorer-pos', title);
+            
+            // Scripts section
+            const scriptsSection = document.createElement('div');
+            scriptsSection.className = 'file-explorer-section';
+            
+            const scriptsTitle = document.createElement('div');
+            scriptsTitle.className = 'file-explorer-section-title';
+            scriptsTitle.textContent = 'scripts';
+            scriptsSection.appendChild(scriptsTitle);
+            
+            // Find all cells and list their script files (single panel)
+            const root = document.querySelector('.main-content') || document;
+            const cells = root.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const cellText = header.textContent.trim();
+                    const cellMatch = cellText.match(/Cell: ([a-zA-Z_][a-zA-Z0-9_]*)/);
+                    if (cellMatch) {
+                        const cellId = cellMatch[1];
+                        const scriptItem = document.createElement('div');
+                        scriptItem.className = 'file-explorer-item script';
+                        scriptItem.textContent = `${cellId}.py`;
+                        scriptItem.onclick = function() {
+                            cell.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        };
+                        scriptsSection.appendChild(scriptItem);
+                    }
+                }
+            });
+            
+            fileExplorer.appendChild(scriptsSection);
+            
+            // Artifacts section
+            const artifactsSection = document.createElement('div');
+            artifactsSection.className = 'file-explorer-section';
+            
+            const artifactsTitle = document.createElement('div');
+            artifactsTitle.className = 'file-explorer-section-title';
+            artifactsTitle.textContent = 'artifacts';
+            artifactsSection.appendChild(artifactsTitle);
+            
+            // Find all artifact links (single panel)
+            const artifactsRoot = document.querySelector('.main-content') || document;
+            const artifacts = artifactsRoot.querySelectorAll('.artifact');
+            if (artifacts.length === 0) {
+                const noArtifacts = document.createElement('div');
+                noArtifacts.className = 'file-explorer-item artifact';
+                noArtifacts.textContent = '(none)';
+                noArtifacts.style.opacity = '0.5';
+                artifactsSection.appendChild(noArtifacts);
+            } else {
+                artifacts.forEach(artifact => {
+                    const artifactItem = document.createElement('div');
+                    artifactItem.className = 'file-explorer-item artifact';
+                    artifactItem.textContent = artifact.textContent;
+                    artifactItem.onclick = function() {
+                        artifact.click();
+                    };
+                    artifactsSection.appendChild(artifactItem);
+                });
+            }
+            
+            fileExplorer.appendChild(artifactsSection);
+            
+            return fileExplorer;
+        }
+
+        function initStatusWidget() {
+            let el = document.querySelector('.status-widget');
+            if (!el) {
+                el = document.createElement('div');
+                el.className = 'status-widget';
+                el.id = 'status-widget';
+                el.textContent = 'ready — Esc';
+                document.body.appendChild(el);
+            }
+        }
+
+        // Tools widget
+        let _cursorX = 0;
+        let _cursorY = 0;
+        let _cursorVisible = false;
+        
+        function setActiveTool(tool) {
+            if (!tool || tool === 'none') {
+                document.body.dataset.tool = 'none';
+                localStorage.setItem('uvnote-active-tool', 'none');
+                setOverlayActive(false);
+                _cursorVisible = false;
+                // Remove active class from all tool buttons when deactivating
+                const toolButtons = document.querySelectorAll('.tools-widget .tool-button');
+                toolButtons.forEach(btn => btn.classList.remove('active'));
+                updateStateIndicator();
+                encodeToolStateToUrl();
+                return;
+            }
+            document.body.dataset.tool = tool;
+            localStorage.setItem('uvnote-active-tool', tool);
+            setOverlayActive(true);
+            _cursorVisible = true;
+            updateStateIndicator();
+            encodeToolStateToUrl();
+        }
+
+        // Make setActiveTool globally accessible for ESC key handler
+        window.setActiveTool = setActiveTool;
+        
+
+        function getArrowColor() {
+            const saved = localStorage.getItem('uvnote-arrow-color');
+            if (saved) return saved;
+            return '#e53935'; // Default red color
+        }
+
+        function setStoredArrowColor(color) {
+            try { localStorage.setItem('uvnote-arrow-color', color); } catch (_) {}
+        }
+
+        function getLineThickness() {
+            const saved = localStorage.getItem('uvnote-line-thickness');
+            if (saved) return parseInt(saved, 10);
+            return 6; // default thickness
+        }
+
+        function setStoredLineThickness(thickness) {
+            try { localStorage.setItem('uvnote-line-thickness', thickness); } catch (_) {}
+        }
+
+        function getFadeoutTime() {
+            const saved = localStorage.getItem('uvnote-fadeout-time');
+            if (saved) return parseInt(saved, 10);
+            return 5; // default 5 seconds
+        }
+
+        function setStoredFadeoutTime(seconds) {
+            try { localStorage.setItem('uvnote-fadeout-time', seconds); } catch (_) {}
+        }
+
+        function createToolsWidget() {
+            const tools = document.createElement('div');
+            tools.className = 'tools-widget';
+
+            const title = document.createElement('div');
+            title.className = 'tools-title';
+            title.textContent = 'tools';
+            tools.appendChild(title);
+
+            const row = document.createElement('div');
+            row.className = 'tools-row';
+            tools.appendChild(row);
+
+            // Arrow tool
+            const arrowBtn = document.createElement('div');
+            arrowBtn.className = 'tool-button';
+            arrowBtn.textContent = 'arrow';
+            arrowBtn.onclick = function() {
+                const isActive = arrowBtn.classList.contains('active');
+                if (isActive) {
+                    arrowBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    arrowBtn.classList.add('active');
+                    setActiveTool('arrow');
+                }
+            };
+            row.appendChild(arrowBtn);
+
+            // Pen tool
+            const penBtn = document.createElement('div');
+            penBtn.className = 'tool-button';
+            penBtn.textContent = 'pen';
+            penBtn.onclick = function() {
+                const isActive = penBtn.classList.contains('active');
+                if (isActive) {
+                    penBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    penBtn.classList.add('active');
+                    setActiveTool('pen');
+                }
+            };
+            row.appendChild(penBtn);
+
+            // Eraser tool
+            const eraseBtn = document.createElement('div');
+            eraseBtn.className = 'tool-button';
+            eraseBtn.textContent = 'eraser';
+            eraseBtn.onclick = function() {
+                const isActive = eraseBtn.classList.contains('active');
+                if (isActive) {
+                    eraseBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    eraseBtn.classList.add('active');
+                    setActiveTool('eraser');
+                }
+            };
+            row.appendChild(eraseBtn);
+
+            // Spotlight tool
+            const spotlightBtn = document.createElement('div');
+            spotlightBtn.className = 'tool-button';
+            spotlightBtn.textContent = 'spotlight';
+            spotlightBtn.onclick = function() {
+                const isActive = spotlightBtn.classList.contains('active');
+                if (isActive) {
+                    spotlightBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    spotlightBtn.classList.add('active');
+                    setActiveTool('spotlight');
+                }
+            };
+            row.appendChild(spotlightBtn);
+
+            // Clear all
+            const clearBtn = document.createElement('div');
+            clearBtn.className = 'tool-button';
+            clearBtn.textContent = 'clear';
+            clearBtn.onclick = function() {
+                _shapes = [];
+                saveShapes();
+                renderOverlay();
+            };
+            row.appendChild(clearBtn);
+            
+            // We'll add the copy button at the end of the widget
+
+            // Restore active state from storage
+            const saved = localStorage.getItem('uvnote-active-tool') || 'none';
+            if (saved === 'arrow') {
+                arrowBtn.classList.add('active');
+                setActiveTool('arrow');
+            } else if (saved === 'pen') {
+                penBtn.classList.add('active');
+                setActiveTool('pen');
+            } else if (saved === 'eraser') {
+                eraseBtn.classList.add('active');
+                setActiveTool('eraser');
+            } else if (saved === 'spotlight') {
+                spotlightBtn.classList.add('active');
+                setActiveTool('spotlight');
+            }
+
+            // Color selector
+            const colorTitle = document.createElement('div');
+            colorTitle.className = 'tools-section-title';
+            colorTitle.textContent = 'color';
+            tools.appendChild(colorTitle);
+
+            const colorRow = document.createElement('div');
+            colorRow.className = 'tools-row color-row';
+            tools.appendChild(colorRow);
+
+            const swatchColors = [
+                // Primary colors
+                '#e53935', '#fb8c00', '#fdd835', '#43a047', '#1e88e5', '#8e24aa',
+                // Additional useful colors  
+                '#ff5722', '#795548', '#607d8b', '#9c27b0',
+                // Grayscale
+                '#000000', '#424242', '#9e9e9e', '#ffffff'
+            ];
+            const swatches = [];
+            swatchColors.forEach(c => {
+                const s = document.createElement('div');
+                s.className = 'color-swatch';
+                s.style.backgroundColor = c;
+                s.title = c;
+                s.onclick = () => {
+                    setStoredArrowColor(c);
+                    refreshColorUI(c);
+                    if (_cursorVisible) renderOverlay();
+                    encodeToolStateToUrl();
+                };
+                colorRow.appendChild(s);
+                swatches.push(s);
+            });
+
+            const colorInput = document.createElement('input');
+            colorInput.type = 'color';
+            colorInput.className = 'color-input';
+            colorInput.oninput = () => {
+                setStoredArrowColor(colorInput.value);
+                refreshColorUI(colorInput.value);
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+            colorRow.appendChild(colorInput);
+
+            function refreshColorUI(selected) {
+                const selectedHex = selected.startsWith('#') ? selected.toLowerCase() : rgbToHex(selected);
+                
+                swatches.forEach((s, i) => {
+                    const swatchHex = swatchColors[i].toLowerCase();
+                    if (swatchHex === selectedHex) {
+                        s.classList.add('selected');
+                    } else {
+                        s.classList.remove('selected');
+                    }
+                });
+                
+                try { 
+                    colorInput.value = selectedHex; 
+                } catch (_) {}
+            }
+
+            function rgbToHex(rgb) {
+                const m = rgb.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)\)/i);
+                if (!m) return '#000000';
+                const r = parseInt(m[1]).toString(16).padStart(2, '0');
+                const g = parseInt(m[2]).toString(16).padStart(2, '0');
+                const b = parseInt(m[3]).toString(16).padStart(2, '0');
+                return `#${r}${g}${b}`;
+            }
+
+            // Restore color selection
+            refreshColorUI(getArrowColor());
+
+            // Thickness slider
+            const thicknessTitle = document.createElement('div');
+            thicknessTitle.className = 'tools-section-title';
+            thicknessTitle.textContent = 'thickness';
+            tools.appendChild(thicknessTitle);
+
+            const thicknessRow = document.createElement('div');
+            thicknessRow.className = 'thickness-row';
+            tools.appendChild(thicknessRow);
+
+            const thicknessSlider = document.createElement('input');
+            thicknessSlider.type = 'range';
+            thicknessSlider.className = 'thickness-slider';
+            thicknessSlider.min = '1';
+            thicknessSlider.max = '10';
+            thicknessSlider.value = getLineThickness();
+            
+            const thicknessValue = document.createElement('span');
+            thicknessValue.className = 'thickness-value';
+            thicknessValue.textContent = thicknessSlider.value + 'px';
+
+            thicknessSlider.oninput = function() {
+                const value = parseInt(thicknessSlider.value, 10);
+                setStoredLineThickness(value);
+                thicknessValue.textContent = value + 'px';
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+
+            thicknessRow.appendChild(thicknessSlider);
+            thicknessRow.appendChild(thicknessValue);
+
+            // Fadeout time slider
+            const fadeoutTitle = document.createElement('div');
+            fadeoutTitle.className = 'tools-section-title';
+            fadeoutTitle.textContent = 'fadeout time';
+            tools.appendChild(fadeoutTitle);
+
+            const fadeoutRow = document.createElement('div');
+            fadeoutRow.className = 'thickness-row';
+            tools.appendChild(fadeoutRow);
+
+            const fadeoutSlider = document.createElement('input');
+            fadeoutSlider.type = 'range';
+            fadeoutSlider.className = 'thickness-slider';
+            fadeoutSlider.min = '0';
+            fadeoutSlider.max = '30';
+            fadeoutSlider.value = getFadeoutTime();
+            
+            const fadeoutValue = document.createElement('span');
+            fadeoutValue.className = 'thickness-value';
+            fadeoutValue.textContent = fadeoutSlider.value === '0' ? 'never' : fadeoutSlider.value + 's';
+
+            fadeoutSlider.oninput = function() {
+                const value = parseInt(fadeoutSlider.value, 10);
+                setStoredFadeoutTime(value);
+                fadeoutValue.textContent = value === 0 ? 'never' : value + 's';
+                encodeToolStateToUrl();
+            };
+
+            fadeoutRow.appendChild(fadeoutSlider);
+            fadeoutRow.appendChild(fadeoutValue);
+
+            // Draggable behavior
+            makeDraggable(tools, 'uvnote-tools-pos', title);
+
+            return tools;
+        }
+
+        function initTools() {
+            const widget = createToolsWidget();
+            document.body.appendChild(widget);
+        }
+
+        function teardownTools() {
+            const w = document.querySelector('.tools-widget');
+            if (w && w.parentNode) w.parentNode.removeChild(w);
+        }
+
+        // --- Canvas overlay for tools ---
+        let _overlay = null;
+        let _overlayCtx = null;
+        let _overlayContainer = null; // window
+        let _overlayMode = 'single';
+        let _overlayResizeHandler = null;
+        let _overlayScrollHandler = null;
+        let _drawing = null; // current in-progress arrow {x1,y1,x2,y2}
+        let _shapes = []; // committed shapes for current mode
+        let _fadeTimer = null; // timer for fade animation
+        let _urlLoadedTool = false; // track if tool was loaded from URL
+        let _isInitializing = true; // prevent URL updates during initialization
+        let _initialCellStates = {}; // track initial cell states from page load
+
+        function getOverlayStorageKey() { return 'uvnote-shapes'; }
+
+        function loadShapes() {
+            try {
+                const raw = localStorage.getItem(getOverlayStorageKey());
+                _shapes = raw ? JSON.parse(raw) : [];
+            } catch (_) { _shapes = []; }
+        }
+
+        function saveShapes() {
+            try { 
+                localStorage.setItem(getOverlayStorageKey(), JSON.stringify(_shapes));
+                // Always update URL when shapes change
+                encodeToolStateToUrl();
+            } catch (_) {}
+        }
+
+        function updateShapesFade() {
+            const now = Date.now();
+            let needsUpdate = false;
+
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const shape = _shapes[i];
+                if (!shape.createdAt) continue; // Skip old shapes without timestamps
+                
+                // Use individual shape's fadeout time, or global if not set
+                const shapesFadeoutSeconds = shape.fadeoutTime !== undefined ? shape.fadeoutTime : getFadeoutTime();
+                
+                // Skip fading if fadeout is disabled for this shape
+                if (shapesFadeoutSeconds === 0) continue;
+                
+                const fadeStartTime = Math.max(0, (shapesFadeoutSeconds - 2) * 1000); // Start fading 2s before end
+                const fadeEndTime = shapesFadeoutSeconds * 1000; // Fully gone after specified time
+                const age = now - shape.createdAt;
+                
+                if (age >= fadeEndTime) {
+                    // Remove completely faded shapes
+                    _shapes.splice(i, 1);
+                    needsUpdate = true;
+                } else if (age >= fadeStartTime) {
+                    // Update opacity for fading shapes
+                    const fadeProgress = (age - fadeStartTime) / (fadeEndTime - fadeStartTime);
+                    const newOpacity = 1 - fadeProgress;
+                    if (Math.abs(shape.opacity - newOpacity) > 0.01) {
+                        shape.opacity = newOpacity;
+                        needsUpdate = true;
+                    }
+                }
+            }
+
+            if (needsUpdate) {
+                saveShapes();
+                renderOverlay();
+                // Update URL to remove faded shapes  
+                encodeToolStateToUrl();
+            }
+        }
+
+        function getContentContainer() { return window; }
+
+        function updateOverlayModeAndContainer() {
+            _overlayContainer = window;
+            _overlayMode = 'single';
+        }
+
+        function updateOverlayBounds() {
+            if (!_overlay) return;
+            if (_overlayContainer === window) {
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = '0px';
+                _overlay.style.top = '0px';
+                _overlay.width = window.innerWidth;
+                _overlay.height = window.innerHeight;
+            } else {
+                const rect = _overlayContainer.getBoundingClientRect();
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = rect.left + 'px';
+                _overlay.style.top = rect.top + 'px';
+                _overlay.width = Math.max(0, Math.floor(rect.width));
+                _overlay.height = Math.max(0, Math.floor(rect.height));
+            }
+            renderOverlay();
+        }
+
+        function containerScrollLeft() {
+            return (_overlayContainer === window) ? (window.scrollX || 0) : (_overlayContainer.scrollLeft || 0);
+        }
+        function containerScrollTop() {
+            return (_overlayContainer === window) ? (window.scrollY || 0) : (_overlayContainer.scrollTop || 0);
+        }
+
+        function toCanvasCoords(clientX, clientY) {
+            const rect = _overlay.getBoundingClientRect();
+            return { x: clientX - rect.left, y: clientY - rect.top };
+        }
+
+        function onPointerDown(e) {
+            const tool = document.body.dataset.tool;
+            if (tool === 'arrow') {
+                startDrawArrow(e);
+            } else if (tool === 'pen') {
+                startDrawPen(e);
+            } else if (tool === 'eraser') {
+                eraseAt(e);
+            } else if (tool === 'spotlight') {
+                startDrawSpotlight(e);
+            }
+        }
+
+        function onPointerMove(e) {
+            // Update cursor position
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _cursorX = pt.x;
+            _cursorY = pt.y;
+            
+            if (!_drawing) {
+                // Just update cursor position and re-render
+                if (_cursorVisible) {
+                    renderOverlay();
+                }
+                return;
+            }
+            
+            if (_drawing.type === 'pen') {
+                moveDrawPen(e);
+            } else if (_drawing.type === 'spotlight') {
+                moveDrawSpotlight(e);
+            } else {
+                moveDrawArrow(e);
+            }
+        }
+        
+        function onPointerEnter(e) {
+            _cursorVisible = document.body.dataset.tool !== 'none';
+            if (_cursorVisible) {
+                renderOverlay();
+            }
+        }
+        
+        function onPointerLeave(e) {
+            _cursorVisible = false;
+            renderOverlay();
+        }
+
+        function onPointerUp(e) {
+            if (!_drawing) return;
+            if (_drawing.type === 'pen') {
+                endDrawPen();
+            } else if (_drawing.type === 'spotlight') {
+                endDrawSpotlight();
+            } else {
+                endDrawArrow();
+            }
+        }
+
+        function startDrawArrow(e) {
+            if (document.body.dataset.tool !== 'arrow') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                x1: pt.x + containerScrollLeft(),
+                y1: pt.y + containerScrollTop(),
+                x2: pt.x + containerScrollLeft(),
+                y2: pt.y + containerScrollTop(),
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawArrow(e) {
+            if (!_drawing) return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.x2 = pt.x + containerScrollLeft();
+            _drawing.y2 = pt.y + containerScrollTop();
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawArrow() {
+            if (!_drawing) return;
+            _shapes.push({ 
+                type: 'arrow', 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawPen(e) {
+            if (document.body.dataset.tool !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'pen',
+                points: [{
+                    x: pt.x + containerScrollLeft(),
+                    y: pt.y + containerScrollTop()
+                }],
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawPen(e) {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.points.push({
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop()
+            });
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawPen() {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            if (_drawing.points.length > 1) {
+                _shapes.push({ 
+                    ..._drawing,
+                    createdAt: Date.now(),
+                    fadeoutTime: getFadeoutTime(),
+                    opacity: 1.0
+                });
+            }
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawSpotlight(e) {
+            if (document.body.dataset.tool !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'spotlight',
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop(),
+                radius: getLineThickness() * 20, // Use thickness to control spotlight size (bigger default)
+                color: getArrowColor()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawSpotlight(e) {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const dx = pt.x + containerScrollLeft() - _drawing.x;
+            const dy = pt.y + containerScrollTop() - _drawing.y;
+            _drawing.radius = Math.max(20, Math.sqrt(dx * dx + dy * dy)); // Minimum radius of 20
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawSpotlight() {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            _shapes.push({ 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function distPointToSegment(px, py, x1, y1, x2, y2) {
+            const dx = x2 - x1, dy = y2 - y1;
+            if (dx === 0 && dy === 0) return Math.hypot(px - x1, py - y1);
+            const t = Math.max(0, Math.min(1, ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)));
+            const cx = x1 + t * dx, cy = y1 + t * dy;
+            return Math.hypot(px - cx, py - cy);
+        }
+
+        function eraseAt(e) {
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const x = pt.x + containerScrollLeft();
+            const y = pt.y + containerScrollTop();
+            const threshold = 10; // pixels
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const s = _shapes[i];
+                if (s.type === 'arrow') {
+                    const d = distPointToSegment(x, y, s.x1, s.y1, s.x2, s.y2);
+                    if (d <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                } else if (s.type === 'pen' && s.points) {
+                    // Check if click is near any line segment in the pen stroke
+                    let minDist = Infinity;
+                    for (let j = 1; j < s.points.length; j++) {
+                        const d = distPointToSegment(x, y, s.points[j-1].x, s.points[j-1].y, s.points[j].x, s.points[j].y);
+                        minDist = Math.min(minDist, d);
+                    }
+                    if (minDist <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                }
+            }
+            e.preventDefault();
+        }
+
+        function drawArrow(ctx, x1, y1, x2, y2, color, width, opacity = 1.0) {
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.fillStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            
+            // Check if points are too close (initial state)
+            const dx = x2 - x1;
+            const dy = y2 - y1;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            
+            if (distance < 5) {
+                // Draw just a small arrowhead pointing down-right when first clicked
+                const defaultAngle = Math.PI / 4; // 45 degrees (down-right)
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate arrowhead points
+                const hx1 = x1 + headLength * Math.cos(defaultAngle - headAngle);
+                const hy1 = y1 + headLength * Math.sin(defaultAngle - headAngle);
+                const hx2 = x1 + headLength * Math.cos(defaultAngle + headAngle);
+                const hy2 = y1 + headLength * Math.sin(defaultAngle + headAngle);
+                
+                // Draw arrowhead only
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            } else {
+                // Normal arrow drawing - head at x1,y1, tail at x2,y2
+                const angle = Math.atan2(y1 - y2, x1 - x2);
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate where the line should end (before the arrowhead)
+                const lineEndX = x1 - headLength * 0.8 * Math.cos(angle);
+                const lineEndY = y1 - headLength * 0.8 * Math.sin(angle);
+                
+                // Draw the line from tail to near the head
+                ctx.beginPath();
+                ctx.moveTo(x2, y2);
+                ctx.lineTo(lineEndX, lineEndY);
+                ctx.stroke();
+                
+                // Calculate arrowhead points
+                const hx1 = x1 - headLength * Math.cos(angle - headAngle);
+                const hy1 = y1 - headLength * Math.sin(angle - headAngle);
+                const hx2 = x1 - headLength * Math.cos(angle + headAngle);
+                const hy2 = y1 - headLength * Math.sin(angle + headAngle);
+                
+                // Draw arrowhead
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            }
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawPen(ctx, points, color, width, offX, offY, opacity = 1.0) {
+            if (!points || points.length < 2) return;
+            
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            ctx.beginPath();
+            ctx.moveTo(points[0].x - offX, points[0].y - offY);
+            for (let i = 1; i < points.length; i++) {
+                ctx.lineTo(points[i].x - offX, points[i].y - offY);
+            }
+            ctx.stroke();
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawAllSpotlights(ctx, spotlights, offX, offY) {
+            if (!spotlights || spotlights.length === 0) return;
+            
+            ctx.save();
+            
+            // Calculate the overall opacity based on all spotlights
+            const maxOpacity = Math.max(...spotlights.map(s => s.opacity || 1.0));
+            
+            // Fill entire canvas with dark overlay
+            ctx.fillStyle = `rgba(0, 0, 0, ${0.7 * maxOpacity})`;
+            ctx.fillRect(0, 0, ctx.canvas.width, ctx.canvas.height);
+            
+            // Cut out completely transparent holes for all spotlights
+            ctx.globalCompositeOperation = 'destination-out';
+            ctx.fillStyle = 'rgba(0, 0, 0, 1)'; // Solid black to ensure complete removal
+            for (const spotlight of spotlights) {
+                ctx.beginPath();
+                ctx.arc(spotlight.x - offX, spotlight.y - offY, spotlight.radius, 0, 2 * Math.PI);
+                ctx.fill();
+            }
+            
+            ctx.restore();
+        }
+
+        function renderOverlay() {
+            if (!_overlay || !_overlayCtx) return;
+            _overlayCtx.clearRect(0, 0, _overlay.width, _overlay.height);
+            const offX = containerScrollLeft();
+            const offY = containerScrollTop();
+            // Draw non-spotlight shapes first
+            for (const s of _shapes) {
+                const opacity = s.opacity !== undefined ? s.opacity : 1.0;
+                if (s.type === 'arrow') {
+                    drawArrow(_overlayCtx, s.x1 - offX, s.y1 - offY, s.x2 - offX, s.y2 - offY, s.color || '#f00', s.width || 2, opacity);
+                } else if (s.type === 'pen') {
+                    drawPen(_overlayCtx, s.points, s.color || '#f00', s.width || 2, offX, offY, opacity);
+                }
+            }
+            // Draw current drawing (non-spotlight)
+            if (_drawing) {
+                if (_drawing.type === 'pen') {
+                    drawPen(_overlayCtx, _drawing.points, _drawing.color, _drawing.width, offX, offY);
+                } else if (_drawing.type !== 'spotlight') {
+                    drawArrow(_overlayCtx, _drawing.x1 - offX, _drawing.y1 - offY, _drawing.x2 - offX, _drawing.y2 - offY, _drawing.color, _drawing.width);
+                }
+            }
+            
+            // Collect all spotlights (existing + current drawing + cursor preview)
+            const spotlights = [];
+            
+            // Add existing spotlight shapes
+            for (const s of _shapes) {
+                if (s.type === 'spotlight') {
+                    spotlights.push({
+                        x: s.x,
+                        y: s.y, 
+                        radius: s.radius,
+                        opacity: s.opacity !== undefined ? s.opacity : 1.0
+                    });
+                }
+            }
+            
+            // Add current spotlight being drawn
+            if (_drawing && _drawing.type === 'spotlight') {
+                spotlights.push({
+                    x: _drawing.x,
+                    y: _drawing.y,
+                    radius: _drawing.radius,
+                    opacity: 1.0
+                });
+            }
+            
+            // Add cursor preview spotlight if tool is active
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                if (tool === 'spotlight') {
+                    const thickness = getLineThickness();
+                    const radius = thickness * 20;
+                    const cursorWorldX = _cursorX + containerScrollLeft();
+                    const cursorWorldY = _cursorY + containerScrollTop();
+                    spotlights.push({
+                        x: cursorWorldX,
+                        y: cursorWorldY,
+                        radius: radius,
+                        opacity: 0.8
+                    });
+                }
+            }
+            
+            // Draw all spotlights as a single overlay with multiple holes
+            drawAllSpotlights(_overlayCtx, spotlights, offX, offY);
+            
+            // Draw cursor indicators for non-spotlight tools
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                const color = getArrowColor();
+                const thickness = getLineThickness();
+                
+                if (tool !== 'spotlight') {
+                    _overlayCtx.save();
+                    _overlayCtx.fillStyle = color;
+                    _overlayCtx.globalAlpha = 0.7;
+                    
+                    if (tool === 'eraser') {
+                        // Draw eraser indicator
+                        _overlayCtx.strokeStyle = color;
+                        _overlayCtx.lineWidth = 2;
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, 10, 0, 2 * Math.PI);
+                        _overlayCtx.stroke();
+                    } else {
+                        // Draw dot for pen/arrow
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, thickness / 2, 0, 2 * Math.PI);
+                        _overlayCtx.fill();
+                    }
+                    
+                    _overlayCtx.restore();
+                }
+            }
+        }
+
+        function setOverlayActive(active) {
+            if (!_overlay) initOverlay();
+            _overlay.style.pointerEvents = active ? 'auto' : 'none';
+            _overlay.style.cursor = active ? 'none' : 'auto';
+            // Re-render to ensure visibility aligns with content
+            renderOverlay();
+        }
+
+        function initOverlay() {
+            if (_overlay) return;
+            updateOverlayModeAndContainer();
+            _overlay = document.createElement('canvas');
+            _overlay.className = 'draw-overlay';
+            _overlayCtx = _overlay.getContext('2d');
+            document.body.appendChild(_overlay);
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+
+            // Events
+            _overlay.addEventListener('mousedown', onPointerDown);
+            _overlay.addEventListener('mousemove', onPointerMove);
+            _overlay.addEventListener('mouseenter', onPointerEnter);
+            _overlay.addEventListener('mouseleave', onPointerLeave);
+            document.addEventListener('mouseup', onPointerUp);
+            _overlay.addEventListener('touchstart', onPointerDown, { passive: false });
+            _overlay.addEventListener('touchmove', onPointerMove, { passive: false });
+            document.addEventListener('touchend', onPointerUp);
+
+            _overlayResizeHandler = () => updateOverlayBounds();
+            window.addEventListener('resize', _overlayResizeHandler);
+
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+            
+            // Start fade animation timer
+            _fadeTimer = setInterval(updateShapesFade, 100); // Update every 100ms for smooth fade
+        }
+
+        function rebindOverlayContainer() {
+            if (!_overlay) return;
+            // Remove old scroll handler
+            if (_overlayScrollHandler) { window.removeEventListener('scroll', _overlayScrollHandler); }
+            updateOverlayModeAndContainer();
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+        }
+
+        function teardownOverlay() {
+            if (!_overlay) return;
+            _overlay.removeEventListener('mousedown', onPointerDown);
+            _overlay.removeEventListener('mousemove', onPointerMove);
+            _overlay.removeEventListener('mouseenter', onPointerEnter);
+            _overlay.removeEventListener('mouseleave', onPointerLeave);
+            document.removeEventListener('mouseup', onPointerUp);
+            _overlay.removeEventListener('touchstart', onPointerDown);
+            _overlay.removeEventListener('touchmove', onPointerMove);
+            document.removeEventListener('touchend', onPointerUp);
+            if (_overlayResizeHandler) window.removeEventListener('resize', _overlayResizeHandler);
+            if (_overlayScrollHandler) {
+                if (_overlayContainer === window) {
+                    window.removeEventListener('scroll', _overlayScrollHandler);
+                } else if (_overlayContainer) {
+                    _overlayContainer.removeEventListener('scroll', _overlayScrollHandler);
+                }
+            }
+            if (_fadeTimer) {
+                clearInterval(_fadeTimer);
+                _fadeTimer = null;
+            }
+            if (_overlay.parentNode) _overlay.parentNode.removeChild(_overlay);
+            _overlay = null; _overlayCtx = null; _overlayContainer = null; _overlayResizeHandler = null; _overlayScrollHandler = null; _drawing = null;
+        }
+        
+        function teardownFileExplorer() {
+            const fe = document.querySelector('.file-explorer');
+            if (fe && fe.parentNode) fe.parentNode.removeChild(fe);
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+
+        function runCell(cellId){
+            const btn=document.querySelector('.run-btn[onclick*="'+cellId+'"]');
+            const output=document.getElementById('output-'+cellId);
+            if(btn){btn.textContent='⏳ running...';btn.disabled=true;}
+            if(output){output.classList.add('output-stale');}
+            fetch('/run/'+cellId,{method:'POST'}).then(r=>r.json()).then(data=>{
+                if(output){
+                    output.classList.remove('output-stale');
+                    let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
+
+                    console.log('UV Logs:', data);
+                    if(data.stderr) {
+                        // Split UV logs from regular stderr
+                        const lines = data.stderr.split('\\n');
+                        let uvLogs = [];
+                        let regularLogs = [];
+                        let inUvSection = true;
+
+                        for (const line of lines) {
+                            if (inUvSection) {
+                                uvLogs.push(line);
+                                if (line.startsWith('Installed ')) {
+                                    inUvSection = false;
+                                }
+                            } else {
+                                regularLogs.push(line);
+                            }
+                        }
+                        
+
+                        // If we never found "Installed", treat it all as regular stderr
+                        if (inUvSection) {
+                            html+='<div class="cell-stderr">'+escapeHtml(data.stderr)+'</div>';
+                        } else {
+                            const uvLogsStr = uvLogs.join('\\n');
+                            const regularLogsStr = regularLogs.join('\\n').trim();
+
+                            if (uvLogsStr) {
+                                html+='<div class="uv-install-logs">';
+                                html+='<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>';
+                                html+='<div class="uv-logs-content" style="display: none;">'+escapeHtml(uvLogsStr)+'</div>';
+                                html+='</div>';
+                            }
+                            if (regularLogsStr) {
+                                html+='<div class="cell-stderr">'+escapeHtml(regularLogsStr)+'</div>';
+                            }
+                        }
+                    }
+                    output.innerHTML=html;
+                }
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            }).catch(e=>{
+                console.error('Run failed:',e);
+                if(output){output.classList.remove('output-stale');}
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            });
+        }
+
+        function copyCell(cellId){
+            // Try multiple selectors to find the code element
+            // Pygments generates .highlight > pre with spans, not wrapped in <code>
+            let codeElement = document.querySelector('#code-'+cellId+' .highlight pre');
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' pre');
+            }
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' code');
+            }
+            if (!codeElement) {
+                // Fallback to the code div itself
+                codeElement = document.getElementById('code-'+cellId);
+            }
+
+            const btn = document.querySelector('.copy-btn[onclick*="'+cellId+'"]');
+
+            if (!codeElement) {
+                console.error('Code element not found for cell:', cellId);
+                return;
+            }
+            if (!btn) {
+                console.error('Copy button not found for cell:', cellId);
+                return;
+            }
+
+            const codeText = codeElement.textContent;
+            
+            if (navigator.clipboard && navigator.clipboard.writeText) {
+                navigator.clipboard.writeText(codeText).then(function() {
+                    console.log('Clipboard copy successful');
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                }).catch(function(err) {
+                    console.warn('Clipboard copy failed:', err);
+                    fallbackCopy();
+                });
+            } else {
+                console.log('Using fallback copy method');
+                fallbackCopy();
+            }
+            
+            function fallbackCopy() {
+                const textarea = document.createElement('textarea');
+                textarea.value = codeText;
+                textarea.style.position = 'absolute';
+                textarea.style.left = '-9999px';
+                document.body.appendChild(textarea);
+                textarea.select();
+                try {
+                    const success = document.execCommand('copy');
+                    console.log('Fallback copy success:', success);
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                } catch (err) {
+                    console.error('Fallback copy failed:', err);
+                    btn.textContent = 'Copy failed';
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                    }, 2000);
+                }
+                document.body.removeChild(textarea);
+            }
+        }
+
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
+
+
+        document.addEventListener('DOMContentLoaded', function() {
+            // Capture initial cell states before any modifications
+            captureInitialCellStates();
+            
+            updateThemeIcon();
+            updateUiMenu();
+            updateUiDebug();
+            const widgetsEnabled = (document.documentElement.getAttribute('data-widgets') || 'on') === 'on';
+            if (widgetsEnabled) {
+            initMinimap();
+            initFileExplorer();
+            initTools();
+            initOverlay();
+            initStatusWidget();
+            initializeWidgetVisibility();
+            layoutWidgetsStackedBottomRight();
+            window.addEventListener('resize', layoutWidgetsStackedBottomRight);
+            }
+
+            // Apply deep-link selection if present
+            applyLocationFromUrl();
+            updateStateIndicator();
+            
+            // Apply cell states from URL immediately
+            const url = new URL(window.location.href);
+            const encodedCellStates = url.searchParams.get('cells');
+            if (encodedCellStates) {
+                console.log('Applying cell states from URL...');
+                const cellStates = decodeCellStatesFromUrl(encodedCellStates);
+                
+                // Use requestAnimationFrame to ensure DOM is ready
+                requestAnimationFrame(() => {
+                    applyCellStatesFromUrl(cellStates);
+                    
+                    // Clear initialization flag after cell states are applied
+                    if (typeof _isInitializing !== 'undefined') {
+                        _isInitializing = false;
+                    }
+                });
+            } else {
+                // Clear initialization flag even if no cell states
+                if (typeof _isInitializing !== 'undefined') {
+                    requestAnimationFrame(() => {
+                        _isInitializing = false;
+                    });
+                }
+            }
+
+            // Bind drag selection on line numbers
+            document.addEventListener('mousedown', onLineNumberMouseDown);
+            document.addEventListener('mousemove', onDocMouseMove);
+            document.addEventListener('mouseup', onDocMouseUp);
+
+            // Add ESC key handler to exit tools
+            document.addEventListener('keydown', function(e) {
+                if (e.key === 'Escape' || e.keyCode === 27) {
+                    const currentTool = document.body.dataset.tool;
+                    if (currentTool && currentTool !== 'none') {
+                        // Deactivate the current tool
+                        window.setActiveTool('none');
+                    }
+                    // Also clear any active line selection
+                    clearSelection(true);
+                }
+            });
+        });
+
+        function updateStateIndicator() {
+            try {
+                const el = document.getElementById('status-widget');
+                if (!el) return;
+                const tool = document.body.dataset.tool || 'none';
+                if (tool && tool !== 'none') {
+                    el.textContent = `tool: ${tool} — Esc`;
+                    return;
+                }
+                if (_selection) {
+                    const t = _selection.a === _selection.b ? `L${_selection.a}` : `L${_selection.a}-${_selection.b}`;
+                    el.textContent = `selected: ${t} — Esc`;
+                    return;
+                }
+                el.textContent = 'ready — Esc';
+            } catch (_) {}
+        }
+    </script>
+</head>
+
+
+<body>
+    <div class="controls">
+        <div class="controls-buttons">
+            
+            <a href="index.html" class="back-button">← back</a>
+            
+            <div class="theme-toggle" onclick="toggleTheme()">light</div>
+            <div class="reset-toggle" onclick="resetLayout()">reset</div>
+            <div class="menu-button" onclick="toggleMenu()">
+                menu ▼
+                <div class="menu-dropdown">
+                    <div class="menu-item" onclick="setUiTheme('default')">
+                        <span class="menu-checkbox" id="checkbox-ui-default">☑</span> Theme: default
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('none')">
+                        <span class="menu-checkbox" id="checkbox-ui-none">☐</span> Theme: none
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('monocolor')">
+                        <span class="menu-checkbox" id="checkbox-ui-monocolor">☐</span> Theme: monocolor
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('tools')">
+                        <span class="menu-checkbox" id="checkbox-tools">☐</span> Tools
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('file-explorer')">
+                        <span class="menu-checkbox" id="checkbox-file-explorer">☐</span> File Explorer
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('minimap')">
+                        <span class="menu-checkbox" id="checkbox-minimap">☐</span> Table of Contents
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('status')">
+                        <span class="menu-checkbox" id="checkbox-status">☑</span> Status Indicator
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    
+    <div class="system-info">
+        <div class="system-info-header">Generated on:</div>
+        <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+        </div>
+    </div>
+    
+    <div class="main-content">
+        <h1>Binned PyTorch - OpenAI-style MoE</h1>
+<h2>GPU Info</h2>
+<div class="cell" id="cell-nv">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span> 
+<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+</span> | 
+Cell: nv | 0.24s
+ | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
+<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/openai_moe/impls/binned_torch.md" target="_blank" class="github-btn">GitHub</a>
+</div>
+<div id="code-nv" class="cell-code" data-lines="2">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">&quot;nvidia-smi&quot;</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-nv"></div>
+</div>
+</div>
+<div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:34 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   34C    P0             81W /  350W |       0MiB /  46068MiB |     18%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+</pre></div>
+</div>
+</div>
+
+<h2>OpenAI-style MoE Benchmark (Binned PyTorch)</h2>
+<div class="cell" id="cell-benchmark">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
+<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+</span> | 
+Cell: benchmark | 727.85s
+ | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
+<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/openai_moe/impls/binned_torch.md" target="_blank" class="github-btn">GitHub</a>
+</div>
+<div id="code-benchmark" class="cell-code" data-lines="136">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
+<span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
+<span class="c1"># dependencies = [</span>
+<span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
+<span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
+<span class="c1"># ]</span>
+<span class="c1">#</span>
+<span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
+<span class="c1"># ///</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="kn">import</span> <span class="n">KernelTypeEnum</span><span class="p">,</span> <span class="n">run_benchmark</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">binned_gather</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">indices</span><span class="p">,</span> <span class="n">bins</span><span class="p">,</span> <span class="n">expert_capacity</span><span class="p">,</span> <span class="n">top_k</span><span class="p">):</span>
+    <span class="n">E</span><span class="p">,</span> <span class="n">H</span> <span class="o">=</span> <span class="n">bins</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
+    <span class="n">out</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">E</span><span class="p">,</span> <span class="n">expert_capacity</span><span class="p">,</span> <span class="n">H</span><span class="p">),</span> <span class="n">device</span><span class="o">=</span><span class="n">x</span><span class="o">.</span><span class="n">device</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">x</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
+    <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">E</span><span class="p">):</span>
+        <span class="n">start</span> <span class="o">=</span> <span class="mi">0</span> <span class="k">if</span> <span class="n">e</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">else</span> <span class="n">bins</span><span class="p">[</span><span class="n">e</span> <span class="o">-</span> <span class="mi">1</span><span class="p">]</span>
+        <span class="n">end</span> <span class="o">=</span> <span class="n">bins</span><span class="p">[</span><span class="n">e</span><span class="p">]</span>
+        <span class="n">n</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">end</span> <span class="o">-</span> <span class="n">start</span><span class="p">,</span> <span class="n">expert_capacity</span><span class="p">)</span>
+        <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n</span><span class="p">):</span>
+            <span class="n">flat_pos</span> <span class="o">=</span> <span class="n">indices</span><span class="p">[</span><span class="n">start</span> <span class="o">+</span> <span class="n">i</span><span class="p">]</span>
+            <span class="n">tok</span> <span class="o">=</span> <span class="n">flat_pos</span> <span class="o">//</span> <span class="n">top_k</span>
+            <span class="n">out</span><span class="p">[</span><span class="n">e</span><span class="p">,</span> <span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">x</span><span class="p">[</span><span class="n">tok</span><span class="p">]</span>
+    <span class="k">return</span> <span class="n">out</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">binned_scatter</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">indices</span><span class="p">,</span> <span class="n">weights</span><span class="p">,</span> <span class="n">bins</span><span class="p">,</span> <span class="n">expert_capacity</span><span class="p">,</span> <span class="n">top_k</span><span class="p">):</span>
+    <span class="n">E</span><span class="p">,</span> <span class="n">C</span><span class="p">,</span> <span class="n">H</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span>
+    <span class="n">N</span> <span class="o">=</span> <span class="n">indices</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">//</span> <span class="n">top_k</span>
+    <span class="n">out</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">N</span><span class="p">,</span> <span class="n">top_k</span><span class="p">,</span> <span class="n">H</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">x</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">x</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+    <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">E</span><span class="p">):</span>
+        <span class="n">start</span> <span class="o">=</span> <span class="mi">0</span> <span class="k">if</span> <span class="n">e</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">else</span> <span class="n">bins</span><span class="p">[</span><span class="n">e</span> <span class="o">-</span> <span class="mi">1</span><span class="p">]</span>
+        <span class="n">end</span> <span class="o">=</span> <span class="n">bins</span><span class="p">[</span><span class="n">e</span><span class="p">]</span>
+        <span class="n">n</span> <span class="o">=</span> <span class="n">end</span> <span class="o">-</span> <span class="n">start</span>
+        <span class="k">if</span> <span class="n">n</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+            <span class="k">continue</span>
+        <span class="n">take</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">expert_capacity</span><span class="p">)</span>
+        <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">take</span><span class="p">):</span>
+            <span class="n">flat_pos</span> <span class="o">=</span> <span class="n">indices</span><span class="p">[</span><span class="n">start</span> <span class="o">+</span> <span class="n">i</span><span class="p">]</span>  <span class="c1"># flattened (token, slot)</span>
+            <span class="n">tok</span> <span class="o">=</span> <span class="n">flat_pos</span> <span class="o">//</span> <span class="n">top_k</span>
+            <span class="n">slot</span> <span class="o">=</span> <span class="n">flat_pos</span> <span class="o">%</span> <span class="n">top_k</span>
+            <span class="n">scale</span> <span class="o">=</span> <span class="n">weights</span><span class="p">[</span><span class="n">flat_pos</span><span class="p">]</span> <span class="k">if</span> <span class="n">weights</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="mf">1.0</span>
+            <span class="n">out</span><span class="p">[</span><span class="n">tok</span><span class="p">,</span> <span class="n">slot</span><span class="p">]</span> <span class="o">=</span> <span class="n">x</span><span class="p">[</span><span class="n">e</span><span class="p">,</span> <span class="n">i</span><span class="p">]</span> <span class="o">*</span> <span class="n">scale</span>
+    <span class="k">return</span> <span class="n">out</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">dim</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">sort_tokens_by_expert</span><span class="p">(</span><span class="n">router_indices</span><span class="p">,</span> <span class="n">num_experts</span><span class="p">):</span>
+    <span class="n">flat_indices</span> <span class="o">=</span> <span class="n">router_indices</span><span class="o">.</span><span class="n">flatten</span><span class="p">()</span>
+    <span class="n">sorted_values</span><span class="p">,</span> <span class="n">sorted_indices</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">flat_indices</span><span class="p">)</span>
+    <span class="n">tokens_per_expert</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">bincount</span><span class="p">(</span><span class="n">sorted_values</span><span class="p">,</span> <span class="n">minlength</span><span class="o">=</span><span class="n">num_experts</span><span class="p">)</span>
+    <span class="n">bins</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cumsum</span><span class="p">(</span><span class="n">tokens_per_expert</span><span class="p">,</span> <span class="n">dim</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">sorted_indices</span><span class="p">,</span> <span class="n">sorted_values</span><span class="p">,</span> <span class="n">bins</span><span class="p">,</span> <span class="n">tokens_per_expert</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">binned_experts_ref</span><span class="p">(</span>
+    <span class="n">hidden_states</span><span class="p">,</span>
+    <span class="n">router_indices</span><span class="p">,</span>
+    <span class="n">routing_weights</span><span class="p">,</span>
+    <span class="n">gate_up_proj</span><span class="p">,</span>
+    <span class="n">gate_up_proj_bias</span><span class="p">,</span>
+    <span class="n">down_proj</span><span class="p">,</span>
+    <span class="n">down_proj_bias</span><span class="p">,</span>
+    <span class="n">expert_capacity</span><span class="p">,</span>
+<span class="p">):</span>
+    <span class="n">B</span><span class="p">,</span> <span class="n">S</span><span class="p">,</span> <span class="n">H</span> <span class="o">=</span> <span class="n">hidden_states</span><span class="o">.</span><span class="n">shape</span>
+    <span class="n">E</span><span class="p">,</span> <span class="n">K</span> <span class="o">=</span> <span class="n">routing_weights</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="n">router_indices</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
+
+    <span class="n">indices</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">bins</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">sort_tokens_by_expert</span><span class="p">(</span><span class="n">router_indices</span><span class="p">,</span> <span class="n">E</span><span class="p">)</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="n">binned_gather</span><span class="p">(</span><span class="n">hidden_states</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">H</span><span class="p">),</span> <span class="n">indices</span><span class="p">,</span> <span class="n">bins</span><span class="p">,</span> <span class="n">expert_capacity</span><span class="p">,</span> <span class="n">K</span><span class="p">)</span>
+
+    <span class="n">gate_up</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">bmm</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">gate_up_proj</span><span class="p">)</span> <span class="o">+</span> <span class="n">gate_up_proj_bias</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="p">:]</span>
+    <span class="n">gate</span><span class="p">,</span> <span class="n">up</span> <span class="o">=</span> <span class="n">gate_up</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="p">::</span><span class="mi">2</span><span class="p">],</span> <span class="n">gate_up</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="mi">1</span><span class="p">::</span><span class="mi">2</span><span class="p">]</span>
+
+    <span class="c1"># clamp to limit</span>
+    <span class="n">limit</span> <span class="o">=</span> <span class="mf">7.0</span>
+    <span class="n">gate</span> <span class="o">=</span> <span class="n">gate</span><span class="o">.</span><span class="n">clamp</span><span class="p">(</span><span class="nb">min</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="nb">max</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span>
+    <span class="n">up</span> <span class="o">=</span> <span class="n">up</span><span class="o">.</span><span class="n">clamp</span><span class="p">(</span><span class="nb">min</span><span class="o">=-</span><span class="n">limit</span><span class="p">,</span> <span class="nb">max</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span>
+
+    <span class="n">glu</span> <span class="o">=</span> <span class="n">gate</span> <span class="o">*</span> <span class="n">torch</span><span class="o">.</span><span class="n">sigmoid</span><span class="p">(</span><span class="n">gate</span> <span class="o">*</span> <span class="mf">1.702</span><span class="p">)</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="p">(</span><span class="n">up</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">glu</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">bmm</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">down_proj</span><span class="p">)</span> <span class="o">+</span> <span class="n">down_proj_bias</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="p">:]</span>
+
+    <span class="c1"># build routing weights aligned to (token, slot)</span>
+    <span class="n">flat_dense</span> <span class="o">=</span> <span class="n">routing_weights</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">E</span><span class="p">)</span>  <span class="c1"># [B*S, E]</span>
+    <span class="n">flat_router</span> <span class="o">=</span> <span class="n">router_indices</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">K</span><span class="p">)</span>  <span class="c1"># [B*S, K]</span>
+    <span class="n">selected</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">gather</span><span class="p">(</span><span class="n">flat_dense</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">flat_router</span><span class="p">)</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>  <span class="c1"># [B*S*K]</span>
+
+    <span class="c1"># scatter back</span>
+    <span class="n">y</span> <span class="o">=</span> <span class="n">binned_scatter</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">indices</span><span class="p">,</span> <span class="n">selected</span><span class="p">,</span> <span class="n">bins</span><span class="p">,</span> <span class="n">expert_capacity</span><span class="p">,</span> <span class="n">K</span><span class="p">)</span>  <span class="c1"># [B*S, H]</span>
+
+    <span class="k">return</span> <span class="n">y</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">B</span><span class="p">,</span> <span class="n">S</span><span class="p">,</span> <span class="n">H</span><span class="p">)</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">binned_torch_openai_moe</span><span class="p">(</span>
+    <span class="n">hidden_states</span><span class="p">,</span>
+    <span class="n">router_indices</span><span class="p">,</span>
+    <span class="n">routing_weights</span><span class="p">,</span>
+    <span class="n">gate_up_proj</span><span class="p">,</span>
+    <span class="n">gate_up_proj_bias</span><span class="p">,</span>
+    <span class="n">down_proj</span><span class="p">,</span>
+    <span class="n">down_proj_bias</span><span class="p">,</span>
+<span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Binned PyTorch implementation of OpenAI-style MoE.</span>
+<span class="sd">    Sorts tokens by expert assignment for more efficient batched processing.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="n">B</span><span class="p">,</span> <span class="n">S</span> <span class="o">=</span> <span class="n">hidden_states</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">hidden_states</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
+    <span class="n">K</span> <span class="o">=</span> <span class="n">router_indices</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
+
+    <span class="c1"># Set expert_capacity to a reasonable value (max tokens per expert)</span>
+    <span class="c1"># Use 2x the average to handle imbalance</span>
+    <span class="n">expert_capacity</span> <span class="o">=</span> <span class="p">(</span><span class="n">B</span> <span class="o">*</span> <span class="n">S</span> <span class="o">*</span> <span class="n">K</span> <span class="o">*</span> <span class="mi">2</span><span class="p">)</span> <span class="o">//</span> <span class="n">routing_weights</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
+
+    <span class="k">return</span> <span class="n">binned_experts_ref</span><span class="p">(</span>
+        <span class="n">hidden_states</span><span class="p">,</span>
+        <span class="n">router_indices</span><span class="p">,</span>
+        <span class="n">routing_weights</span><span class="p">,</span>
+        <span class="n">gate_up_proj</span><span class="p">,</span>
+        <span class="n">gate_up_proj_bias</span><span class="p">,</span>
+        <span class="n">down_proj</span><span class="p">,</span>
+        <span class="n">down_proj_bias</span><span class="p">,</span>
+        <span class="n">expert_capacity</span><span class="p">,</span>
+    <span class="p">)</span>
+
+
+<span class="n">run_benchmark</span><span class="p">(</span>
+    <span class="n">kernel_type</span><span class="o">=</span><span class="n">KernelTypeEnum</span><span class="o">.</span><span class="n">OPENAI_MOE</span><span class="p">,</span>
+    <span class="n">impl_name</span><span class="o">=</span><span class="s2">&quot;binned_torch&quot;</span><span class="p">,</span>
+    <span class="n">impl_tags</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;family&quot;</span><span class="p">:</span> <span class="s2">&quot;pytorch&quot;</span><span class="p">,</span> <span class="s2">&quot;backend&quot;</span><span class="p">:</span> <span class="s2">&quot;eager&quot;</span><span class="p">},</span>
+    <span class="n">impl_func</span><span class="o">=</span><span class="n">binned_torch_openai_moe</span><span class="p">,</span>
+    <span class="n">dtype</span><span class="o">=</span><span class="s2">&quot;float32&quot;</span><span class="p">,</span>
+<span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-benchmark"></div>
+</div>
+</div>
+<div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Running openai_moe benchmark on cuda with 8 workloads.
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S512_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     906.550ms      1808.50%     906.550ms     906.550ms             1  
+                                           binned_torch        25.29%     229.728ms       100.00%     908.308ms     908.308ms       0.000us         0.00%      50.129ms      50.129ms             1  
+                                             aten::item         1.81%      16.434ms        25.66%     233.033ms      15.186us       0.000us         0.00%      15.809ms       1.030us         15345  
+                              aten::_local_scalar_dense         6.08%      55.189ms        23.85%     216.599ms      14.115us      15.808ms        31.54%      15.809ms       1.030us         15345  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      15.808ms        31.54%      15.808ms       1.030us         15345  
+                                              aten::bmm         0.02%     187.925us         0.02%     226.636us      37.773us       7.688ms        15.34%       7.688ms       1.281ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.688ms        15.34%       7.688ms       1.281ms             6  
+                                     aten::floor_divide         5.37%      48.789ms        13.13%     119.247ms      19.409us       7.554ms        15.07%       7.554ms       1.230us          6144  
+                                            aten::copy_         3.71%      33.699ms         9.08%      82.451ms      13.394us       6.606ms        13.18%       6.607ms       1.073us          6156  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.602ms        13.17%       6.602ms       1.073us          6153  
+                                              aten::mul         3.08%      27.972ms         5.49%      49.893ms      16.194us       4.718ms         9.41%       4.718ms       1.531us          3081  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.471ms         8.92%       4.471ms       1.456us          3072  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032ms         8.04%       4.032ms       1.312us          3072  
+                                        aten::remainder         3.03%      27.567ms         4.66%      42.309ms      13.772us       3.722ms         7.42%       3.722ms       1.212us          3072  
+                                              aten::add         2.91%      26.436ms         4.87%      44.207ms      14.575us       3.546ms         7.07%       3.546ms       1.169us          3033  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.524ms         7.03%       3.524ms       1.147us          3072  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.156ms         6.30%       3.156ms       1.042us          3030  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.964ms         3.92%       1.964ms       1.279us          1536  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.758ms         3.51%       1.758ms       1.145us          1536  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     286.305us         0.57%     286.305us      47.718us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 908.315ms
+Self CUDA time total: 50.127ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S512_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us     939.657ms      1760.51%     939.657ms     939.657ms             1  
+                                           binned_torch        24.72%     232.366ms       100.00%     940.175ms     940.175ms       0.000us         0.00%      53.379ms      53.379ms             1  
+                                             aten::item         1.65%      15.471ms        26.56%     249.752ms      14.748us       0.000us         0.00%      17.339ms       1.024us         16935  
+                              aten::_local_scalar_dense         6.16%      57.893ms        24.92%     234.282ms      13.834us      17.337ms        32.48%      17.339ms       1.024us         16935  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      17.337ms        32.48%      17.337ms       1.024us         16935  
+                                              aten::bmm         0.02%     191.684us         0.02%     230.777us      38.463us       7.882ms        14.77%       7.882ms       1.314ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us       7.882ms        14.77%       7.882ms       1.314ms             6  
+                                     aten::floor_divide         5.10%      47.974ms        12.37%     116.337ms      18.935us       7.540ms        14.13%       7.541ms       1.227us          6144  
+                                            aten::copy_         3.80%      35.738ms         9.00%      84.586ms      13.740us       6.593ms        12.35%       6.595ms       1.071us          6156  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.590ms        12.35%       6.590ms       1.071us          6153  
+                                              aten::add         4.16%      39.066ms         7.01%      65.874ms      14.342us       5.113ms         9.58%       5.113ms       1.113us          4593  
+                                              aten::mul         2.92%      27.472ms         5.20%      48.883ms      15.866us       4.715ms         8.83%       4.715ms       1.530us          3081  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       4.472ms         8.38%       4.472ms       1.456us          3072  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       4.021ms         7.53%       4.021ms       1.309us          3072  
+                                        aten::remainder         2.73%      25.664ms         4.27%      40.147ms      13.069us       3.707ms         6.95%       3.707ms       1.207us          3072  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.519ms         6.59%       3.519ms       1.146us          3072  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.178ms         5.95%       3.178ms       1.049us          3030  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.958ms         3.67%       1.958ms       1.275us          1536  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.749ms         3.28%       1.749ms       1.139us          1536  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       1.537ms         2.88%       1.537ms       0.985us          1560  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 940.182ms
+Self CUDA time total: 53.374ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.751s      1703.41%        1.751s        1.751s             1  
+                                           binned_torch        24.63%     431.727ms       100.00%        1.753s        1.753s       0.000us         0.00%     102.829ms     102.829ms             1  
+                                             aten::item         1.69%      29.621ms        25.96%     455.095ms      14.915us       0.000us         0.00%      31.387ms       1.029us         30513  
+                              aten::_local_scalar_dense         5.96%     104.552ms        24.27%     425.474ms      13.944us      31.383ms        30.52%      31.387ms       1.029us         30513  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      31.383ms        30.52%      31.383ms       1.029us         30513  
+                                              aten::bmm         0.01%     224.614us         0.02%     267.595us      44.599us      15.143ms        14.73%      15.143ms       2.524ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.143ms        14.73%      15.143ms       2.524ms             6  
+                                     aten::floor_divide         5.56%      97.549ms        13.34%     233.779ms      19.025us      15.089ms        14.68%      15.090ms       1.228us         12288  
+                                            aten::copy_         4.01%      70.283ms         9.47%     166.011ms      13.497us      13.317ms        12.95%      13.317ms       1.083us         12300  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.313ms        12.95%      13.313ms       1.083us         12294  
+                                              aten::mul         3.14%      55.060ms         5.66%      99.236ms      16.128us      11.295ms        10.99%      11.297ms       1.836us          6153  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.940ms         9.67%       9.940ms       1.618us          6144  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.059ms         7.84%       8.059ms       1.312us          6144  
+                                              aten::add         2.85%      49.952ms         4.90%      85.866ms      14.522us       7.505ms         7.30%       7.506ms       1.269us          5913  
+                                        aten::remainder         3.02%      53.015ms         4.74%      83.117ms      13.528us       7.414ms         7.21%       7.416ms       1.207us          6144  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.031ms         6.84%       7.031ms       1.144us          6144  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.224ms         6.05%       6.224ms       1.053us          5910  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.914ms         3.81%       3.914ms       1.274us          3072  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.500ms         3.40%       3.500ms       1.139us          3072  
+                                            aten::clamp         0.00%      71.603us         0.01%     117.833us      19.639us       1.180ms         1.15%       1.180ms     196.722us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.753s
+Self CUDA time total: 102.819ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        1.834s      1680.90%        1.834s        1.834s             1  
+                                           binned_torch        24.76%     454.393ms       100.00%        1.835s        1.835s       0.000us         0.00%     109.119ms     109.119ms             1  
+                                             aten::item         1.65%      30.229ms        26.42%     484.819ms      14.374us       0.000us         0.00%      34.734ms       1.030us         33729  
+                              aten::_local_scalar_dense         6.08%     111.551ms        24.77%     454.590ms      13.478us      34.731ms        31.83%      34.734ms       1.030us         33729  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      34.731ms        31.83%      34.731ms       1.030us         33729  
+                                              aten::bmm         0.01%     219.836us         0.01%     260.868us      43.478us      15.243ms        13.97%      15.243ms       2.540ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      15.243ms        13.97%      15.243ms       2.540ms             6  
+                                     aten::floor_divide         5.37%      98.619ms        12.62%     231.581ms      18.846us      15.065ms        13.81%      15.065ms       1.226us         12288  
+                                            aten::copy_         3.65%      66.986ms         8.64%     158.623ms      12.896us      13.313ms        12.20%      13.316ms       1.083us         12300  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      13.309ms        12.20%      13.309ms       1.082us         12297  
+                                              aten::mul         2.96%      54.365ms         5.27%      96.616ms      15.702us      10.967ms        10.05%      10.969ms       1.783us          6153  
+                                              aten::add         4.05%      74.247ms         6.97%     127.934ms      14.060us      10.631ms         9.74%      10.631ms       1.168us          9099  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       9.613ms         8.81%       9.613ms       1.565us          6144  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       8.047ms         7.37%       8.047ms       1.310us          6144  
+                                        aten::remainder         2.81%      51.641ms         4.37%      80.193ms      13.052us       7.438ms         6.82%       7.438ms       1.211us          6144  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.018ms         6.43%       7.018ms       1.142us          6144  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       6.225ms         5.71%       6.225ms       1.053us          5910  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       3.928ms         3.60%       3.928ms       1.279us          3072  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.510ms         3.22%       3.510ms       1.143us          3072  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       3.154ms         2.89%       3.154ms       0.990us          3186  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.835s
+Self CUDA time total: 109.111ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S512_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.518s      1672.53%        3.518s        3.518s             1  
+                                           binned_torch        24.37%     858.118ms       100.00%        3.521s        3.521s       0.000us         0.00%     210.357ms     210.357ms             1  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      63.177ms        30.04%      63.177ms       1.026us         61586  
+                                             aten::item         1.69%      59.432ms        26.02%     916.275ms      14.878us       0.000us         0.00%      63.177ms       1.026us         61587  
+                              aten::_local_scalar_dense         5.96%     209.806ms        24.34%     856.843ms      13.913us      63.176ms        30.03%      63.177ms       1.026us         61587  
+                                     aten::floor_divide         5.42%     190.698ms        13.50%     475.217ms      19.337us      30.482ms        14.49%      30.486ms       1.240us         24576  
+                                              aten::bmm         0.01%     235.397us         0.01%     281.998us      47.000us      29.291ms        13.93%      29.291ms       4.882ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.291ms        13.93%      29.291ms       4.882ms             6  
+                                            aten::copy_         3.77%     132.744ms         9.15%     322.282ms      13.107us      26.808ms        12.75%      26.810ms       1.090us         24588  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.805ms        12.74%      26.805ms       1.090us         24582  
+                                              aten::mul         3.15%     110.895ms         5.78%     203.457ms      16.545us      25.566ms        12.15%      25.568ms       2.079us         12297  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.101ms        10.51%      22.101ms       1.799us         12288  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.470ms         7.83%      16.470ms       1.340us         12288  
+                                              aten::add         2.99%     105.439ms         5.15%     181.211ms      14.601us      16.115ms         7.66%      16.116ms       1.298us         12411  
+                                        aten::remainder         2.99%     105.111ms         4.72%     166.195ms      13.525us      14.836ms         7.05%      14.838ms       1.208us         12288  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.014ms         6.66%      14.014ms       1.140us         12288  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.996ms         6.18%      12.996ms       1.047us         12408  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       7.830ms         3.72%       7.830ms       1.274us          6144  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.006ms         3.33%       7.006ms       1.140us          6144  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       2.626ms         1.25%       2.626ms     437.595us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.521s
+Self CUDA time total: 210.342ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S512_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        3.742s      1679.57%        3.742s        3.742s             1  
+                                           binned_torch        24.42%     914.204ms       100.00%        3.744s        3.744s       0.000us         0.00%     222.834ms     222.834ms             1  
+                                             aten::item         1.73%      64.729ms        26.53%     993.125ms      14.638us       0.000us         0.00%      69.848ms       1.030us         67845  
+                              aten::_local_scalar_dense         6.14%     229.850ms        24.80%     928.396ms      13.684us      69.844ms        31.35%      69.848ms       1.030us         67845  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      69.844ms        31.35%      69.844ms       1.030us         67841  
+                                     aten::floor_divide         5.29%     197.931ms        12.52%     468.921ms      19.080us      30.509ms        13.69%      30.515ms       1.242us         24576  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      29.140ms        13.08%      29.140ms       4.857ms             6  
+                                              aten::bmm         0.01%     232.675us         0.01%     273.538us      45.590us      29.140ms        13.08%      29.140ms       4.857ms             6  
+                                            aten::copy_         3.66%     136.881ms         8.73%     326.908ms      13.295us      26.646ms        11.96%      26.647ms       1.084us         24588  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      26.643ms        11.96%      26.643ms       1.084us         24581  
+                                              aten::mul         2.96%     110.832ms         5.24%     196.253ms      15.959us      25.520ms        11.45%      25.522ms       2.075us         12297  
+                                              aten::add         4.16%     155.619ms         7.13%     266.948ms      14.322us      22.169ms         9.95%      22.169ms       1.189us         18639  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.076ms         9.91%      22.076ms       1.797us         12288  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.462ms         7.39%      16.462ms       1.340us         12287  
+                                        aten::remainder         2.77%     103.887ms         4.33%     162.240ms      13.203us      14.877ms         6.68%      14.879ms       1.211us         12288  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.047ms         6.30%      14.047ms       1.143us         12287  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      12.957ms         5.82%      12.957ms       1.044us         12407  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us       7.856ms         3.53%       7.856ms       1.279us          6144  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       7.021ms         3.15%       7.021ms       1.143us          6144  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.109ms         2.74%       6.109ms       0.981us          6228  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.744s
+Self CUDA time total: 222.814ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        6.967s      1665.27%        6.967s        6.967s             1  
+                                           binned_torch        24.68%        1.721s       100.00%        6.973s        6.973s       0.000us         0.00%     418.392ms     418.392ms             1  
+                                             aten::item         1.64%     114.231ms        25.94%        1.809s      14.732us       0.000us         0.00%     125.163ms       1.020us        122763  
+                              aten::_local_scalar_dense         5.97%     416.624ms        24.30%        1.694s      13.802us     125.151ms        29.91%     125.163ms       1.020us        122763  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     125.151ms        29.91%     125.151ms       1.019us        122762  
+                                     aten::floor_divide         5.62%     391.846ms        13.33%     929.253ms      18.906us      61.051ms        14.59%      61.053ms       1.242us         49152  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.281ms        13.69%      57.281ms       9.547ms             6  
+                                              aten::bmm         0.00%     234.996us         0.00%     276.787us      46.131us      57.281ms        13.69%      57.281ms       9.547ms             6  
+                                            aten::copy_         3.92%     273.517ms         9.35%     652.240ms      13.268us      53.435ms        12.77%      53.437ms       1.087us         49158  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.433ms        12.77%      53.433ms       1.087us         49154  
+                                              aten::mul         3.15%     219.950ms         5.62%     391.612ms      15.929us      51.411ms        12.29%      51.419ms       2.091us         24585  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.451ms        10.62%      44.451ms       1.809us         24576  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      32.993ms         7.89%      32.993ms       1.343us         24576  
+                                              aten::add         2.87%     200.428ms         4.94%     344.166ms      14.085us      31.887ms         7.62%      31.889ms       1.305us         24435  
+                                        aten::remainder         3.00%     208.953ms         4.67%     325.902ms      13.261us      29.680ms         7.09%      29.684ms       1.208us         24576  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      28.059ms         6.71%      28.059ms       1.142us         24576  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.247ms         6.03%      25.247ms       1.033us         24431  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.667ms         3.74%      15.667ms       1.275us         12288  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      14.014ms         3.35%      14.014ms       1.140us         12288  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       5.233ms         1.25%       5.233ms     872.184us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 6.973s
+Self CUDA time total: 418.361ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                           binned_torch         0.00%       0.000us         0.00%       0.000us       0.000us        7.368s      1660.72%        7.368s        7.368s             1  
+                                           binned_torch        24.39%        1.797s       100.00%        7.370s        7.370s       0.000us         0.00%     443.698ms     443.698ms             1  
+                                             aten::item         1.69%     124.742ms        26.51%        1.954s      14.504us       0.000us         0.00%     137.717ms       1.022us        134715  
+                              aten::_local_scalar_dense         6.11%     450.407ms        24.82%        1.829s      13.577us     137.708ms        31.04%     137.717ms       1.022us        134715  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     137.710ms        31.04%     137.710ms       1.022us        134711  
+                                     aten::floor_divide         5.42%     399.563ms        12.65%     932.414ms      18.970us      61.071ms        13.77%      61.077ms       1.243us         49152  
+                                              aten::bmm         0.00%     230.664us         0.00%     272.466us      45.411us      57.304ms        12.92%      57.304ms       9.551ms             6  
+                                ampere_sgemm_128x128_nn         0.00%       0.000us         0.00%       0.000us       0.000us      57.304ms        12.92%      57.304ms       9.551ms             6  
+                                            aten::copy_         3.65%     269.132ms         8.67%     639.259ms      13.004us      54.065ms        12.19%      54.067ms       1.100us         49158  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.062ms        12.19%      54.062ms       1.100us         49153  
+                                              aten::mul         2.96%     217.959ms         5.26%     387.551ms      15.764us      51.653ms        11.64%      51.660ms       2.101us         24585  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      44.653ms        10.06%      44.653ms       1.817us         24576  
+                                              aten::add         4.03%     296.962ms         6.96%     512.647ms      14.100us      43.690ms         9.85%      43.694ms       1.202us         36357  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      32.954ms         7.43%      32.954ms       1.341us         24575  
+                                        aten::remainder         2.83%     208.527ms         4.40%     323.906ms      13.180us      29.662ms         6.69%      29.664ms       1.207us         24576  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      28.119ms         6.34%      28.119ms       1.144us         24576  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.409ms         5.73%      25.409ms       1.040us         24431  
+void at::native::vectorized_elementwise_kernel&lt;2, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.666ms         3.53%      15.666ms       1.275us         12288  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      13.995ms         3.15%      13.995ms       1.139us         12288  
+void at::native::unrolled_elementwise_kernel&lt;at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.644ms         2.62%      11.644ms       0.977us         11922  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 7.370s
+Self CUDA time total: 443.660ms
+
+
+impl                     wl                  p50(ms)  ok
+binned_torch             cuda_B1_S1024_E2     372.79  True
+binned_torch             cuda_B1_S1024_E4     382.68  True
+binned_torch             cuda_B1_S512_E2      150.05  True
+binned_torch             cuda_B1_S512_E4      200.26  True
+binned_torch             cuda_B4_S1024_E2    1486.48  True
+binned_torch             cuda_B4_S1024_E4    1524.50  True
+binned_torch             cuda_B4_S512_E2      742.02  True
+binned_torch             cuda_B4_S512_E4      801.90  True
+</pre></div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
+</div>
+</div>
+</div>
+    </div>
+    
+</body>
+</html>
\ No newline at end of file
diff --git a/openai_moe/impls/cells/benchmark.py b/openai_moe/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..1527168c6489fe70597cc9c4a6625c220d6a5e20
--- /dev/null
+++ b/openai_moe/impls/cells/benchmark.py
@@ -0,0 +1,136 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def binned_gather(x, indices, bins, expert_capacity, top_k):  # copy each expert's routed rows of x into out[e, :n]
+    E, H = bins.shape[0], x.shape[1]  # one bin per expert; x is indexed as [row, hidden]
+    out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)  # slots that are never filled stay zero
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]  # bins is read as cumulative end offsets, so the previous end is this bin's start
+        end = bins[e]
+        n = min(end - start, expert_capacity)  # entries of a bin beyond expert_capacity are not gathered
+        for i in range(n):
+            flat_pos = indices[start + i]  # flattened (token, slot) position
+            tok = flat_pos // top_k  # row of x that this flat position belongs to
+            out[e, i] = x[tok]
+    return out  # shape (E, expert_capacity, H)
+
+
+def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):  # write expert rows back to their (token, slot) position, then sum over slots
+    E, C, H = x.shape
+    N = indices.shape[0] // top_k  # indices has one entry per (token, slot), so this is the token count
+    out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)  # positions that are never written stay zero
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]  # bins is read as cumulative end offsets per expert
+        end = bins[e]
+        n = end - start
+        if n == 0:
+            continue
+        take = min(n, expert_capacity)  # only the first expert_capacity entries of a bin are written back
+        for i in range(take):
+            flat_pos = indices[start + i]  # flattened (token, slot)
+            tok = flat_pos // top_k
+            slot = flat_pos % top_k
+            scale = weights[flat_pos] if weights is not None else 1.0  # weights is indexed by the flat (token, slot) position
+            out[tok, slot] = x[e, i] * scale
+    return out.sum(dim=1)  # collapse the top_k slots -> (N, H)
+
+
+def sort_tokens_by_expert(router_indices, num_experts):  # order the flat routing entries by their value and compute per-value bin offsets
+    flat_indices = router_indices.flatten()
+    sorted_values, sorted_indices = torch.sort(flat_indices)  # sorted_indices: flat positions in ascending order of routed value
+    tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)  # at least num_experts counts, zeros included
+    bins = torch.cumsum(tokens_per_expert, dim=0)  # bins[e] = exclusive end offset of value e within the sorted order
+    return sorted_indices, sorted_values, bins, tokens_per_expert
+
+
+def binned_experts_ref(  # bin rows per expert, run both projections with bmm, then scatter back scaled by the routing weights
+    hidden_states,
+    router_indices,
+    routing_weights,
+    gate_up_proj,
+    gate_up_proj_bias,
+    down_proj,
+    down_proj_bias,
+    expert_capacity,
+):
+    B, S, H = hidden_states.shape
+    E, K = routing_weights.shape[2], router_indices.shape[1]  # E from the dense weights' last dim, K from the indices' second dim
+
+    indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
+    x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)  # [E, expert_capacity, H]
+
+    gate_up = torch.bmm(x, gate_up_proj) + gate_up_proj_bias[..., None, :]  # None inserts a singleton dim so the bias broadcasts across rows
+    gate, up = gate_up[..., ::2], gate_up[..., 1::2]  # interleaved layout: even columns are gate, odd columns are up
+
+    # clamp to limit
+    limit = 7.0
+    gate = gate.clamp(min=None, max=limit)  # gate is bounded from above only
+    up = up.clamp(min=-limit, max=limit)
+
+    glu = gate * torch.sigmoid(gate * 1.702)  # gate times sigmoid of the gate scaled by a fixed 1.702
+    x = (up + 1) * glu
+    x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]
+
+    # build routing weights aligned to (token, slot)
+    flat_dense = routing_weights.view(-1, E)  # [B*S, E]
+    flat_router = router_indices.view(-1, K)  # [B*S, K]
+    selected = torch.gather(flat_dense, 1, flat_router).reshape(-1)  # [B*S*K]
+
+    # scatter back
+    y = binned_scatter(x, indices, selected, bins, expert_capacity, K)  # [B*S, H]
+
+    return y.view(B, S, H)
+
+
+def binned_torch_openai_moe(
+    hidden_states,
+    router_indices,
+    routing_weights,
+    gate_up_proj,
+    gate_up_proj_bias,
+    down_proj,
+    down_proj_bias,
+):
+    """
+    Binned PyTorch implementation of OpenAI-style MoE: sorts tokens by expert assignment for batched processing.
+    Capacity is fixed at 2x the mean per-expert load; entries past it in a bin are skipped by the binned gather/scatter.
+    """
+    B, S = hidden_states.shape[0], hidden_states.shape[1]
+    K = router_indices.shape[1]
+
+    # Set expert_capacity to a reasonable value (max tokens per expert)
+    # Use 2x the average to handle imbalance
+    expert_capacity = (B * S * K * 2) // routing_weights.shape[2]  # integer division by the expert count (last dim of routing_weights)
+
+    return binned_experts_ref(
+        hidden_states,
+        router_indices,
+        routing_weights,
+        gate_up_proj,
+        gate_up_proj_bias,
+        down_proj,
+        down_proj_bias,
+        expert_capacity,
+    )
+
+
+run_benchmark(  # module-level call, so it executes as soon as this script is run
+    kernel_type=KernelTypeEnum.OPENAI_MOE,
+    impl_name="binned_torch",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=binned_torch_openai_moe,
+    dtype="float32",  # NOTE(review): presumably the dtype of the generated inputs; confirm in kernels_benchmark_tools
+)
\ No newline at end of file
diff --git a/openai_moe/impls/cells/nv.py b/openai_moe/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5
--- /dev/null
+++ b/openai_moe/impls/cells/nv.py
@@ -0,0 +1,2 @@
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)  # print nvidia-smi's captured stdout; the exit status is not checked
\ No newline at end of file
diff --git a/openai_moe/impls/gpt_oss_moe.html b/openai_moe/impls/gpt_oss_moe.html
new file mode 100644
index 0000000000000000000000000000000000000000..2133ccac99c4f05bc3163b7f04c006955d4539b2
--- /dev/null
+++ b/openai_moe/impls/gpt_oss_moe.html
@@ -0,0 +1,4545 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>gpt_oss_moe</title>
+
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&display=swap" rel="stylesheet">
+
+    <script>
+// Iframe-friendly navigation router: inside an iframe, mirror link targets into the URL hash and route clicks manually
+        (function() {
+            const isIframe = window.self !== window.top;
+            if (!isIframe) return; // Only activate in iframe context
+
+            // On load: if hash points to a different page, navigate there
+            const hash = window.location.hash;
+            if (hash && hash.startsWith('#/')) {
+                const targetPath = hash.slice(2); // Remove '#/'
+                const currentPath = window.location.pathname.split('/').pop(); // last path segment only
+
+                // Only navigate if we're not already on the target page
+                if (targetPath !== currentPath) {
+                    window.location.href = targetPath;
+                    return; // Stop execution, we're navigating away
+                }
+            }
+
+            // Intercept all link clicks for hash-based navigation
+            document.addEventListener('click', function(e) {
+                const link = e.target.closest('a');
+                if (!link) return;
+
+                const href = link.getAttribute('href');
+
+                // Skip missing hrefs, in-page anchors, hrefs starting with 'http', and javascript: links
+                if (!href || href.startsWith('#') || href.startsWith('http') || href.startsWith('javascript:')) {
+                    return;
+                }
+
+                e.preventDefault();
+
+                // Convert relative/absolute path to hash-based navigation
+                const url = new URL(href, window.location.href);
+                let fullPath = url.pathname;
+
+                // Remove leading slash if present for cleaner paths
+                if (fullPath.startsWith('/')) {
+                    fullPath = fullPath.slice(1);
+                }
+
+                // Record the target in this document's own URL hash (window.location here is the iframe's location)
+                window.location.hash = '#/' + fullPath;
+
+                // For HTML files, navigate within iframe
+                if (fullPath.endsWith('.html') || fullPath.endsWith('/')) {
+                    const pathParts = fullPath.split('/').filter(p => p);
+                    const targetFile = pathParts[pathParts.length - 1] || 'index.html'; // NOTE(review): only the last segment is kept, so a link into another directory resolves against the current one; confirm intended
+                    window.location.href = targetFile;
+                } else {
+                    // For non-HTML files (raw .py, etc), open directly
+                    window.open(href, '_blank');
+                }
+            });
+        })();
+
+        // Apply theme and widget visibility immediately to prevent flicker
+        (function() {
+            const configTheme = 'dark';
+            const hasConfigUi = false;
+            const configUi = hasConfigUi ? 'None' : null; // only read when hasConfigUi is true; 'None' is not an accepted ui value below and would fall back to 'default'
+            const hasWidgetsConfig = false;
+            const widgetsOn = hasWidgetsConfig ? false : true; // true while hasWidgetsConfig is false
+            let theme;
+            if (configTheme === 'auto') {
+                theme = window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light'; // follow the prefers-color-scheme media query
+            } else {
+                theme = localStorage.getItem('uvnote-theme') || configTheme; // a stored choice overrides the configured theme
+            }
+            document.documentElement.setAttribute('data-theme', theme);
+
+            // Initialize UI theme (css theme)
+            let ui = hasConfigUi ? configUi : (localStorage.getItem('uvnote-ui') || 'default');
+            if (ui !== 'default' && ui !== 'none' && ui !== 'monocolor') { ui = 'default'; } // any other value falls back to 'default'
+            document.documentElement.setAttribute('data-ui', ui);
+
+            // Apply widgets visibility
+            document.documentElement.setAttribute('data-widgets', widgetsOn ? 'on' : 'off');
+        })();
+    </script>
+    <style>
+:root[data-theme="light"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: #f6f8fa;
+    --bg-tertiary: #f8f9fa;
+    --bg-code: #f8f9fa;
+    --bg-error: #fdf2f2;
+    --bg-artifact: #e6f3ff;
+    --bg-artifact-hover: #d0e7ff;
+
+    --text-primary: #333;
+    --text-secondary: #656d76;
+    --text-error: #c53030;
+    --text-link: #0969da;
+
+    --border-primary: #e1e5e9;
+    --border-error: #e53e3e;
+    --border-cell-failed: #d73a49;
+
+    --shadow: rgba(0, 0, 0, 0.1);
+}
+
+:root[data-theme="dark"] {
+    --bg-primary: #0a0a0a;
+    --bg-secondary: #121212;
+    --bg-tertiary: #181818;
+    --bg-code: #0d0d0d;
+    --bg-error: #1a0f0f;
+    --bg-artifact: #151515;
+    --bg-artifact-hover: #1a1a1a;
+
+    --text-primary: #e0e0e0;
+    --text-secondary: #888888;
+    --text-error: #ff6b6b;
+    --text-link: #64b5f6;
+
+    --border-primary: #2a2a2a;
+    --border-error: #ff6b6b;
+    --border-cell-failed: #ff6b6b;
+
+    --shadow: rgba(255, 255, 255, 0.05);
+}
+
+/* Monocolor UI theme: black/white background, all text/borders single blue */
+:root[data-ui="monocolor"] {
+    --mono-color: #0a66ff;
+}
+
+:root[data-ui="monocolor"][data-theme="light"] {
+    --bg-primary: #ffffff;
+}
+
+:root[data-ui="monocolor"][data-theme="dark"] {
+    --bg-primary: #000000;
+}
+
+:root[data-ui="monocolor"] {
+    --bg-secondary: var(--bg-primary);
+    --bg-tertiary: var(--bg-primary);
+    --bg-code: var(--bg-primary);
+    --bg-error: var(--bg-primary);
+    --bg-artifact: var(--bg-primary);
+    --bg-artifact-hover: var(--bg-primary);
+
+    --text-primary: var(--mono-color);
+    --text-secondary: var(--mono-color);
+    --text-error: var(--mono-color);
+    --text-link: var(--mono-color);
+
+    --border-primary: var(--mono-color);
+    --border-error: var(--mono-color);
+    --border-cell-failed: var(--mono-color);
+
+    --shadow: none;
+}
+
+:root[data-ui="monocolor"] a {
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button,
+:root[data-ui="monocolor"] .theme-toggle,
+:root[data-ui="monocolor"] .reset-toggle,
+:root[data-ui="monocolor"] .back-button {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button:hover,
+:root[data-ui="monocolor"] .theme-toggle:hover,
+:root[data-ui="monocolor"] .reset-toggle:hover,
+:root[data-ui="monocolor"] .back-button:hover {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-dropdown {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    box-shadow: none;
+}
+
+:root[data-ui="monocolor"] .menu-item {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .system-info {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell {
+    border-color: var(--mono-color);
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .cell-header {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact:hover {
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .artifact-preview img,
+:root[data-ui="monocolor"] .artifact-preview svg {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .status-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .minimap,
+:root[data-ui="monocolor"] .file-explorer,
+:root[data-ui="monocolor"] .tools-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell-code {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tools-title,
+:root[data-ui="monocolor"] .file-explorer-section-title,
+:root[data-ui="monocolor"] .minimap-title {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button.active {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .file-explorer-item,
+:root[data-ui="monocolor"] .minimap-item {
+    color: var(--mono-color);
+}
+
+/* Force Pygments code to mono blue on mono bg */
+:root[data-ui="monocolor"] .highlight {
+    background: var(--bg-primary) !important;
+    color: var(--mono-color) !important;
+}
+
+:root[data-ui="monocolor"] .highlight *,
+:root[data-ui="monocolor"] .highlight .hll {
+    color: var(--mono-color) !important;
+    background: transparent !important;
+    border-color: var(--mono-color) !important;
+}
+
+/* Default code font + metrics (overridable via frontmatter) */
+:root {
+    --code-font-size: 0.95rem;
+    --code-line-height: 1.5;
+    --code-pad-y: 0.75rem;
+}
+
+/* Minimal UI theme overrides base variables for a flatter, 90s look */
+:root[data-ui="none"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: transparent;
+    --bg-tertiary: transparent;
+    --bg-code: #f9f9f9;
+    --bg-error: #fff0f0;
+    --bg-artifact: #f0f7ff;
+    --bg-artifact-hover: #e5f1ff;
+
+    --text-primary: #000000;
+    --text-secondary: #222222;
+    --text-error: #a00000;
+    --text-link: #0000ee;
+
+    --border-primary: #cccccc;
+    --border-error: #cc0000;
+    --border-cell-failed: #cc0000;
+
+    --shadow: none;
+}
+
+html {
+    overscroll-behavior: none;
+}
+
+body {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    line-height: 1.4;
+    max-width: 1000px;
+    margin: 0 auto;
+    padding: 15px;
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    transition: background-color 0.2s ease, color 0.2s ease;
+    overscroll-behavior: none;
+}
+
+/* Minimal "none" UI theme overrides */
+:root[data-ui="none"] body {
+    font-family: 'Times New Roman', Times, serif;
+    line-height: 1.5;
+    max-width: 860px;
+    padding: 12px;
+    background: #ffffff;
+    color: #000000;
+    transition: none;
+}
+
+/* Two panel layout removed */
+
+.controls {
+    position: fixed;
+    top: 20px;
+    right: 20px;
+    display: flex;
+    flex-direction: column;
+    align-items: flex-end;
+    gap: 0.25rem;
+    z-index: 1000;
+}
+
+.controls-buttons {
+    display: flex;
+    gap: 0.5rem;
+}
+
+.menu-button {
+    position: relative;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+/* Keep default control styling when widgets are enabled, even in minimal UI */
+:root[data-ui="none"][data-widgets="on"] .menu-button,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle,
+:root[data-ui="none"][data-widgets="on"] .back-button {
+    background: #f6f6f6;
+    border: 1px solid #cccccc;
+    color: #222222;
+}
+
+.menu-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+/* Controls state indicator (top-right) */
+/* Status widget (bottom-right) */
+.status-widget {
+    position: fixed;
+    right: 20px;
+    bottom: 20px;
+    width: auto;
+    max-width: 260px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 6px 8px;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    z-index: 100;
+}
+
+.status-widget strong {
+    color: var(--text-primary);
+}
+
+:root[data-ui="none"][data-widgets="on"] .status-widget {
+    background: #f6f6f6;
+    border-color: #ccc;
+    color: #222;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .back-button:hover {
+    background: #ededed;
+    border-color: #bbbbbb;
+    color: #000000;
+}
+
+.menu-dropdown {
+    position: absolute;
+    top: 100%;
+    right: 0;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    box-shadow: 0 4px 12px var(--shadow);
+    min-width: 160px;
+    opacity: 0;
+    visibility: hidden;
+    transform: translateY(-8px);
+    transition: all 0.2s ease;
+    z-index: 1001;
+    margin-top: 4px;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-dropdown {
+    background: #ffffff;
+    border: 1px solid #cccccc;
+    box-shadow: none;
+}
+
+.menu-button.active .menu-dropdown {
+    opacity: 1;
+    visibility: visible;
+    transform: translateY(0);
+}
+
+.menu-item {
+    display: block;
+    padding: 8px 12px;
+    color: var(--text-secondary);
+    text-decoration: none;
+    font-size: 0.85rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: pointer;
+}
+
+:root[data-ui="none"] .menu-item {
+    color: #000;
+    border-bottom: 1px solid #eee;
+}
+
+.menu-item:last-child {
+    border-bottom: none;
+}
+
+.menu-item:hover {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+}
+
+.menu-checkbox {
+    display: inline-block;
+    width: 16px;
+    font-family: monospace;
+    color: var(--text-link);
+}
+
+.theme-toggle,
+.reset-toggle,
+.back-button {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 4px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+.back-button {
+    text-decoration: none;
+    display: inline-block;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover,
+.back-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+.system-info {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    padding: 8px 12px;
+    margin-bottom: 16px;
+    font-size: 0.85em;
+    color: var(--text-secondary);
+}
+
+.system-info-header {
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 2px;
+}
+
+.system-info-content {
+    font-family: monospace;
+}
+
+.theme-toggle,
+.reset-toggle {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    /* padding: 0.4rem 0.6rem; */
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    user-select: none;
+    transition: all 0.2s ease;
+    text-transform: lowercase;
+    letter-spacing: 0;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover {
+    background: var(--bg-tertiary);
+    border-color: var(--text-secondary);
+    color: var(--text-primary);
+}
+
+.minimap {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Hide widgets and controls when disabled via frontmatter */
+:root[data-widgets="off"] .controls,
+:root[data-widgets="off"] .minimap,
+:root[data-widgets="off"] .file-explorer,
+:root[data-widgets="off"] .tools-widget,
+:root[data-widgets="off"] .status-widget {
+    display: none !important;
+}
+
+.file-explorer {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Drawing overlay */
+.draw-overlay {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100vw;
+    height: 100vh;
+    z-index: 80;
+    /* under widgets (100) and controls (1000) */
+    display: block;
+    pointer-events: none;
+    /* enabled only when a tool is active */
+}
+
+/* Tools widget */
+.tools-widget {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    z-index: 100;
+    opacity: 0.95;
+}
+
+.tools-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    user-select: none;
+}
+
+.tools-row {
+    display: flex;
+    gap: 0.4rem;
+    flex-wrap: wrap;
+}
+
+.tool-button {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.25rem 0.4rem;
+    cursor: pointer;
+    color: var(--text-secondary);
+    font-family: inherit;
+    font-size: 0.75rem;
+    user-select: none;
+}
+
+.tool-button:hover {
+    color: var(--text-primary);
+}
+
+.tool-button.active {
+    color: var(--text-primary);
+    border-color: var(--text-secondary);
+    background: var(--bg-secondary);
+}
+
+.minimap:hover,
+.file-explorer:hover {
+    opacity: 1;
+}
+
+.minimap-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.minimap-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.15rem 0;
+    border-left: 2px solid transparent;
+    padding-left: 0.5rem;
+    transition: all 0.2s ease;
+    cursor: pointer;
+}
+
+.minimap-item:hover {
+    color: var(--text-primary);
+    border-left-color: var(--text-secondary);
+}
+
+.minimap-item.active {
+    color: var(--text-primary);
+    border-left-color: var(--text-link);
+}
+
+.minimap-heading {
+    font-weight: normal;
+}
+
+.minimap-heading.h1 {
+    padding-left: 0.5rem;
+}
+
+.minimap-heading.h2 {
+    padding-left: 1rem;
+}
+
+.minimap-heading.h3 {
+    padding-left: 1.5rem;
+}
+
+.minimap-heading.h4 {
+    padding-left: 2rem;
+}
+
+.minimap-heading.h5 {
+    padding-left: 2.5rem;
+}
+
+.minimap-heading.h6 {
+    padding-left: 3rem;
+}
+
+.minimap-cell {
+    color: var(--text-link);
+    opacity: 0.8;
+    font-style: italic;
+}
+
+.minimap-cell:hover {
+    opacity: 1;
+}
+
+.file-explorer-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.file-explorer-section {
+    margin-bottom: 0.75rem;
+}
+
+.file-explorer-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin-bottom: 0.25rem;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.file-explorer-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.1rem 0;
+    margin-left: 0.5rem;
+    transition: color 0.2s ease;
+    cursor: pointer;
+    font-family: monospace;
+}
+
+.file-explorer-item:hover {
+    color: var(--text-primary);
+}
+
+.file-explorer-item.script {
+    color: var(--text-link);
+}
+
+.file-explorer-item.artifact {
+    color: var(--text-secondary);
+    opacity: 0.8;
+}
+
+
+/* Hide widgets on smaller screens */
+@media (max-width: 768px) {
+
+    .minimap,
+    .file-explorer,
+    .tools-widget {
+        display: none;
+    }
+}
+
+.cell {
+    margin: 1rem 0;
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    overflow: hidden;
+    background: var(--bg-secondary);
+}
+
+:root[data-ui="none"] .cell {
+    margin: 1em 0;
+    border: none;
+    background: transparent;
+}
+
+.cell-header {
+    background: var(--bg-secondary);
+    padding: 0.5rem 1rem;
+    border-bottom: 1px solid var(--border-primary);
+    font-family: inherit;
+    font-size: 0.85rem;
+}
+
+:root[data-ui="none"] .cell-header {
+    background: transparent;
+    border: none;
+    padding: 0;
+    font-weight: bold;
+}
+
+:root[data-ui="none"] .cell-content {
+    padding: 0;
+}
+
+:root[data-ui="none"] .copy-button,
+:root[data-ui="none"] .collapse-indicators,
+:root[data-ui="none"] .cell-meta,
+:root[data-ui="none"] .cell-outputs-header {
+    display: none !important;
+}
+
+:root[data-ui="none"] pre,
+:root[data-ui="none"] code {
+    font-family: Menlo, Monaco, 'Courier New', monospace;
+}
+
+:root[data-ui="none"] .code-content pre {
+    background: #f9f9f9;
+    border: 1px solid #ddd;
+    padding: 8px;
+}
+
+:root[data-ui="none"] .output {
+    background: transparent;
+    border: none;
+    padding: 0.25em 0;
+}
+
+.cell-header {
+    color: var(--text-secondary);
+    cursor: pointer;
+    user-select: none;
+    transition: background-color 0.2s ease;
+}
+.cell-header:hover {
+    background: var(--bg-tertiary);
+}
+
+.collapse-indicators {
+    color: var(--text-secondary);
+    font-size: 0.8rem;
+    opacity: 0.7;
+}
+
+.collapse-indicators span:hover {
+    color: var(--text-primary);
+    opacity: 1;
+}
+
+.cell-code {
+    display: block;
+    background: var(--bg-code);
+}
+
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code pre {
+    margin: 0;
+    padding: 0.75rem;
+    background: var(--bg-code);
+    overflow-x: auto;
+    color: var(--text-primary);
+}
+
+.cell-output {
+    padding: 0.75rem;
+    /* background: var(--bg-primary); */
+    background: var(--bg-secondary);
+}
+
+.cell-output.collapsed {
+    display: none;
+}
+
+.cell-stdout {
+    background: var(--bg-tertiary);
+    padding: 0.75rem;
+    border-radius: 1px;
+    /* margin: 0.25rem 0; */
+    font-family: inherit;
+    font-size: 0.9rem;
+    white-space: pre-wrap;
+    color: var(--text-primary);
+}
+
+.cell-stdout {
+    background: var(--bg-tertiary);
+    padding: 0.75rem;
+    border-radius: 1px;
+    font-family: inherit;
+    font-size: 0.9rem;
+    color: var(--text-primary);
+
+    /* key bits */
+    overflow: auto;
+    /* show scrollbars when needed */
+    max-width: 100%;
+    /* respects whatever layout width you give it */
+}
+
+.cell-stdout .stdout-text {
+    margin: 0;
+    /* reset pre default margin */
+    white-space: pre;
+    /* keep line breaks, NO wrapping */
+    display: inline-block;
+    /* shrink-to-content */
+    min-width: max-content;
+    /* allow very long lines to define intrinsic width */
+    font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+    tab-size: 2;
+}
+
+.cell-stderr {
+    background: var(--bg-error);
+    border-left: 2px solid var(--border-error);
+    padding: 1rem;
+    margin: 0.5rem 0;
+    font-family: inherit;
+    font-size: 0.9rem;
+    color: var(--text-error);
+    white-space: pre-wrap;
+}
+
+.uv-install-logs {
+    margin: 0.5rem 0;
+}
+
+.uv-logs-header {
+    cursor: pointer;
+    padding: 0.75rem;
+    border-left: 3px solid var(--border-primary);
+    font-family: inherit;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    user-select: none;
+}
+
+.uv-logs-content {
+    background: var(--bg-secondary);
+    padding: 1rem;
+    border-left: 3px solid var(--border-primary);
+    white-space: pre-wrap;
+    font-family: monospace;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    overflow-x: auto;
+}
+
+.cell-artifacts {
+    margin: 1rem 0;
+}
+
+.cell-artifacts h4 {
+    margin: 0 0 0.5rem 0;
+    color: var(--text-secondary);
+    font-size: 0.9rem;
+}
+
+.artifact {
+    display: inline-block;
+    background: var(--bg-artifact);
+    padding: 0.25rem 0.5rem;
+    border-radius: 1px;
+    margin: 0.25rem 0.5rem 0.25rem 0;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-link);
+    text-decoration: none;
+    transition: background-color 0.2s ease;
+    border: 1px solid var(--border-primary);
+}
+
+.artifact:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-preview {
+    margin-top: 1rem;
+}
+
+.artifact-preview img {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.artifact-preview svg {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+    display: block;
+}
+
+/* Style SVG text elements */
+.artifact-preview svg g {
+    fill: var(--text-primary) !important;
+}
+
+/* Auto-theme SVG elements */
+.artifact-preview svg {
+    background: transparent;
+}
+
+/* Invert SVG images in dark mode */
+:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
+    filter: invert(0.9) hue-rotate(180deg);
+}
+
+/* Keep SVG images readable in monocolor mode */
+:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
+    filter: none;
+}
+
+/* CSV table styling */
+.artifact-csv {
+    margin-top: 1rem;
+    overflow-x: auto;
+}
+
+.csv-table {
+    width: 100%;
+    border-collapse: collapse;
+    font-size: 0.9rem;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.csv-table th,
+.csv-table td {
+    padding: 0.5rem 0.75rem;
+    text-align: left;
+    border: 1px solid var(--border-primary);
+}
+
+.csv-table th {
+    background: var(--bg-tertiary);
+    font-weight: 600;
+    color: var(--text-primary);
+}
+
+.csv-table tbody tr:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-csv-error {
+    margin-top: 1rem;
+    padding: 1rem;
+    background: var(--bg-error);
+    color: var(--text-error);
+    border: 1px solid var(--border-error);
+    border-radius: 1px;
+}
+
+.cell-failed {
+    border-color: var(--border-cell-failed);
+}
+
+.cell-failed .cell-header {
+    background: var(--bg-error);
+    color: var(--text-error);
+}
+
+.cell-commented {
+    opacity: 0.6;
+    border-style: dashed;
+}
+
+.cell-commented .cell-header {
+    background: var(--bg-secondary);
+    color: var(--text-secondary);
+    font-style: italic;
+}
+
+/* Small inline action buttons. The five variants used to repeat the same
+   declarations verbatim; they are grouped here so the look is defined once. */
+.run-btn,
+.copy-btn,
+.raw-btn,
+.github-btn,
+.hf-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+}
+
+/* Variants that additionally drop the underline and render inline-block */
+.raw-btn,
+.github-btn,
+.hf-btn {
+    text-decoration: none;
+    display: inline-block;
+}
+
+/* Hover feedback, shared by every variant */
+.run-btn:hover,
+.copy-btn:hover,
+.raw-btn:hover,
+.github-btn:hover,
+.hf-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.raw-btn:hover,
+.github-btn:hover,
+.hf-btn:hover {
+    text-decoration: none;
+}
+
+/* Disabled state (only defined for the run and copy buttons) */
+.run-btn:disabled,
+.copy-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+}
+
+/* "Copied" confirmation. Must stay after the :hover rule: both selectors have
+   the same specificity, so source order decides which color wins on hover. */
+.copy-btn.copied {
+    color: #4caf50;
+    background: var(--bg-primary);
+    border-color: #4caf50;
+    transition: all 0.2s ease;
+}
+
+/* Stale output: dimmed, and positioned so the badge below can anchor to it */
+.output-stale {
+    position: relative;
+    opacity: 0.5;
+}
+
+/* "updating" badge pinned to the top-right corner of a stale output */
+.output-stale::after {
+    content: '⏳ updating...';
+    position: absolute;
+    right: 8px;
+    top: 8px;
+    padding: 4px 8px;
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    background: var(--bg-secondary);
+    color: var(--text-secondary);
+    font-size: 0.75em;
+}
+
+/* Headings: common spacing and color */
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+    color: var(--text-primary);
+    margin-bottom: 0.75rem;
+    margin-top: 1.5rem;
+}
+
+/* h1 overrides the shared heading margins (same specificity, so it must follow) */
+h1 {
+    margin-bottom: 1rem;
+    margin-top: 0;
+}
+
+p {
+    color: var(--text-primary);
+    margin: 0.75rem 0;
+}
+
+a {
+    color: var(--text-link);
+}
+
+/* Images scale down with their container and carry no shadow */
+img {
+    box-shadow: none;
+    border-radius: 1px;
+    height: auto;
+    max-width: 100%;
+}
+
+/* Monospace stack and size for all code */
+pre,
+code {
+    font-size: var(--code-font-size);
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+}
+
+/* Positioning context for the absolutely positioned line-highlight overlay */
+.code-wrap {
+    position: relative;
+}
+
+/* Highlight bar; hidden until a script sets display/top/height inline */
+.code-line-highlight {
+    position: absolute;
+    right: 0;
+    left: 0;
+    height: 1.5em;
+    display: none;
+    pointer-events: none;
+    border-left: 3px solid #f4c542;
+    background: rgba(255, 235, 170, 0.35);
+}
+
+/* Clickable line numbers */
+.line-number {
+    padding: 0 0.25rem;
+    color: var(--text-secondary);
+    text-decoration: none;
+    cursor: pointer;
+}
+
+.line-number.selected {
+    color: var(--text-primary);
+    background: rgba(255, 235, 170, 0.4);
+}
+
+/* Code with a line-number gutter: gutter and code sit side by side */
+.highlight-with-lines {
+    display: flex;
+}
+
+/* The gutter column */
+.line-numbers {
+    text-align: right;
+    user-select: none;
+    color: var(--text-secondary);
+    background: var(--bg-tertiary);
+    border-right: 1px solid var(--border-primary);
+    padding: var(--code-pad-y) 0.5rem;
+    line-height: var(--code-line-height);
+    font-size: var(--code-font-size);
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+}
+
+/* One number per row, same line height as the code */
+.line-numbers .line-number {
+    line-height: var(--code-line-height);
+    display: block;
+}
+
+/* The code column takes the remaining width */
+.highlight-with-lines .highlight {
+    flex: 1;
+}
+
+/* Neutralise the highlighter's own .hll background so it does not conflict
+   with the custom line-highlight overlay */
+.highlight .hll {
+    background-color: transparent;
+}
+
+/* Code block metrics */
+.highlight pre {
+    line-height: var(--code-line-height);
+    padding: var(--code-pad-y) 0.75rem;
+    margin: 0;
+    white-space: pre;
+}
+
+/* Collapsed code styling: explicit per-cell state classes */
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code.expanded {
+    display: block;
+}
+
+/* NOTE(review): the bare brace blocks below look like template tags
+   (an if / else / endif on config.collapse_code, then a pygments_css
+   placeholder) that were reformatted and emitted unrendered — confirm against
+   the generator's template. As written they are not valid CSS rules, and BOTH
+   .cell-code rules are present in the sheet, the second one declared last. */
+    {
+    % if config.collapse_code %
+}
+
+.cell-code {
+    display: none;
+}
+
+    {
+    % else %
+}
+
+.cell-code {
+    display: block;
+    border-bottom: 1px solid var(--border-primary);
+}
+
+    {
+    % endif %
+}
+
+/* NOTE(review): presumably where the syntax-highlighter stylesheet was meant
+   to be injected; only the placeholder name is present here — verify. */
+    {
+        {
+        pygments_css
+    }
+}
+
+/* Re-assert the code metrics with !important so they beat any highlighter defaults */
+.highlight pre {
+    border: none;
+    margin: 0;
+    white-space: pre;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important;
+    font-size: var(--code-font-size) !important;
+    line-height: var(--code-line-height) !important;
+    padding: var(--code-pad-y) 0.75rem !important;
+}
+
+/* Gutter and its rows keep the same line height as the code */
+.line-numbers,
+.line-numbers .line-number {
+    line-height: var(--code-line-height) !important;
+}
+
+/* Custom CSS from frontmatter */
+/* NOTE(review): everything from here to the "Cursor for tools" section looks
+   like unrendered template markup (a config.custom_css placeholder, a template
+   comment, and nested if / else / endif tags on config.code_font_size) that
+   was reformatted as if it were CSS — confirm against the generator's
+   template. The bare brace blocks are not valid CSS rules. */
+    {
+        {
+        config.custom_css
+    }
+}
+
+    {
+    # Override code font size from frontmatter (accept number as px) #
+}
+
+    {
+    % if config.code_font_size is not none %
+}
+
+    {
+    % if config.code_font_size is string %
+}
+
+/* NOTE(review): both :root rules below assign --code-font-size a value made of
+   the literal placeholder text rather than a length. Custom properties accept
+   almost any token sequence, so this may override a real value defined earlier
+   and leave every `font-size: var(--code-font-size)` without a usable size —
+   TODO verify in a browser and fix at the template level. */
+:root {
+    --code-font-size: {
+            {
+            config.code_font_size
+        }
+    }
+
+    ;
+}
+
+    {
+    % else %
+}
+
+:root {
+    --code-font-size: {
+            {
+            config.code_font_size
+        }
+    }
+
+    px;
+}
+
+    {
+    % endif %
+}
+
+    {
+    % endif %
+}
+
+/* Cursor for tools */
+/* Each rule swaps the cursor over .main-content while body[data-tool] names
+   the given tool. The cursor is an inline SVG data URI ("%23" is the
+   URL-encoded "#" of the stroke color). The two numbers after url() are the
+   hotspot coordinates inside the 24x24 image; the trailing keyword is the
+   fallback cursor used when the image cannot be applied. */
+
+/* Arrow tool: hotspot at the image centre, falls back to crosshair */
+body[data-tool="arrow"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><path d="M2 2l7.586 7.586"/><circle cx="11" cy="11" r="2"/></svg>') 12 12, crosshair;
+}
+
+/* Pen tool: hotspot at (4, 20), the centre of the filled circle; falls back to pointer */
+body[data-tool="pen"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><circle cx="4" cy="20" r="2" fill="%23e53935"/></svg>') 4 20, pointer;
+}
+
+/* Eraser tool: hotspot at the image centre, falls back to auto */
+body[data-tool="eraser"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M20 20H7l-7-7 7-7h13v14z"/><path d="M13 13l7-7"/><path d="M13 13L9 9"/></svg>') 12 12, auto;
+}
+
+/* Tools panel: small upper-case section caption */
+.tools-section-title {
+    margin: 0.75rem 0 0.5rem 0;
+    font-size: 0.65rem;
+    font-weight: bold;
+    letter-spacing: 0.5px;
+    text-transform: uppercase;
+    color: var(--text-secondary);
+}
+
+/* Color picker: six equal columns of swatches */
+.color-row {
+    display: grid;
+    gap: 0.25rem;
+    grid-template-columns: repeat(6, 1fr);
+    margin-bottom: 0.5rem;
+}
+
+/* A single swatch; relative so the check mark can be centred on it */
+.color-swatch {
+    position: relative;
+    width: 18px;
+    height: 18px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    transition: all 0.2s ease;
+    cursor: pointer;
+}
+
+.color-swatch:hover {
+    border-color: var(--text-secondary);
+    transform: scale(1.1);
+}
+
+.color-swatch.selected {
+    box-shadow: 0 0 0 2px var(--text-link);
+    border-color: var(--text-primary);
+}
+
+/* Check mark centred over the selected swatch */
+.color-swatch.selected::after {
+    content: '✓';
+    position: absolute;
+    left: 50%;
+    top: 50%;
+    transform: translate(-50%, -50%);
+    font-size: 10px;
+    font-weight: bold;
+    color: white;
+    text-shadow: 1px 1px 1px black;
+}
+
+/* Free-form color input; spans two grid columns and is centred in them */
+.color-input {
+    grid-column: span 2;
+    justify-self: center;
+    width: 24px;
+    height: 24px;
+    padding: 0;
+    background: none;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+}
+
+.color-input:hover {
+    border-color: var(--text-secondary);
+}
+
+/* Thickness control: slider and numeric read-out on one row */
+.thickness-row {
+    display: flex;
+    gap: 0.5rem;
+    align-items: center;
+    margin-top: 0.75rem;
+}
+
+/* Range input with native styling removed; slightly faded until hovered */
+.thickness-slider {
+    appearance: none;
+    -webkit-appearance: none;
+    flex: 1;
+    height: 4px;
+    border-radius: 2px;
+    background: var(--border-primary);
+    outline: none;
+    transition: opacity 0.2s;
+    opacity: 0.7;
+}
+
+.thickness-slider:hover {
+    opacity: 1;
+}
+
+/* Thumb, WebKit-prefixed pseudo-element */
+.thickness-slider::-webkit-slider-thumb {
+    appearance: none;
+    -webkit-appearance: none;
+    height: 12px;
+    width: 12px;
+    border-radius: 50%;
+    background: var(--text-link);
+    cursor: pointer;
+}
+
+/* Thumb, Mozilla-prefixed pseudo-element */
+.thickness-slider::-moz-range-thumb {
+    height: 12px;
+    width: 12px;
+    border: none;
+    border-radius: 50%;
+    background: var(--text-link);
+    cursor: pointer;
+}
+
+/* Numeric read-out next to the slider */
+.thickness-value {
+    min-width: 20px;
+    text-align: right;
+    color: var(--text-secondary);
+    font-size: 0.7rem;
+}
+
+/* Highlighted code never paints its own background */
+.highlight {
+    background: none !important;
+}
+
+/* Spinner: a ring whose top edge uses the link color, rotated forever */
+.loading-spinner {
+    display: inline-block;
+    vertical-align: middle;
+    margin-right: 8px;
+    height: 16px;
+    width: 16px;
+    border: 2px solid var(--border-primary);
+    border-top-color: var(--text-link);
+    border-radius: 50%;
+    animation: spin 1s linear infinite;
+}
+
+@keyframes spin {
+    to {
+        transform: rotate(360deg);
+    }
+}
+
+/* Skeleton placeholder: flat color first as a fallback, then the moving gradient */
+.loading-skeleton {
+    display: inline-block;
+    vertical-align: middle;
+    width: 80px;
+    height: 1em;
+    border-radius: 2px;
+    background: var(--bg-tertiary);
+    background: linear-gradient(90deg,
+            var(--bg-tertiary) 25%,
+            var(--bg-secondary) 50%,
+            var(--bg-tertiary) 75%);
+    background-size: 200% 100%;
+    animation: loading-shimmer 2s ease-in-out infinite;
+}
+
+@keyframes loading-shimmer {
+    0% {
+        background-position: -200% 0;
+    }
+
+    100% {
+        background-position: 200% 0;
+    }
+}
+
+/* A cell output that currently contains a spinner is shown dimmed */
+.cell-output:has(.loading-spinner) {
+    background: var(--bg-secondary);
+    opacity: 0.7;
+    /* border-left: 3px solid var(--text-link); */
+}
+    </style>
+    <script>
+// --- Drag utilities ---
+        function clamp(val, min, max) { return Math.max(min, Math.min(max, val)); }
+
+        function restorePosition(el, storageKey) {
+            try {
+                const raw = localStorage.getItem(storageKey);
+                if (!raw) return;
+                const pos = JSON.parse(raw);
+                if (typeof pos.left === 'number' && typeof pos.top === 'number') {
+                    el.style.left = pos.left + 'px';
+                    el.style.top = pos.top + 'px';
+                    el.style.right = 'auto';
+                    el.style.bottom = 'auto';
+                }
+            } catch (_) {}
+        }
+
+        function savePosition(el, storageKey) {
+            try {
+                const left = parseFloat(el.style.left || 'NaN');
+                const top = parseFloat(el.style.top || 'NaN');
+                if (!Number.isNaN(left) && !Number.isNaN(top)) {
+                    localStorage.setItem(storageKey, JSON.stringify({ left, top }));
+                }
+            } catch (_) {}
+        }
+
+
+        function makeDraggable(el, storageKey, handleEl) {
+            let dragging = false;
+            let startX = 0, startY = 0; // cursor
+            let origLeft = 0, origTop = 0; // element
+
+            const onMove = (e) => {
+                if (!dragging) return;
+                const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+                const clientY = e.touches ? e.touches[0].clientY : e.clientY;
+                const dx = clientX - startX;
+                const dy = clientY - startY;
+                const w = el.offsetWidth;
+                const h = el.offsetHeight;
+                const maxX = window.innerWidth - w;
+                const maxY = window.innerHeight - h;
+                const newLeft = clamp(origLeft + dx, 0, maxX);
+                const newTop = clamp(origTop + dy, 0, maxY);
+                el.style.left = newLeft + 'px';
+                el.style.top = newTop + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+            };
+
+            const endDrag = () => {
+                if (!dragging) return;
+                dragging = false;
+                document.removeEventListener('mousemove', onMove);
+                document.removeEventListener('mouseup', endDrag);
+                document.removeEventListener('touchmove', onMove);
+                document.removeEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grab');
+                savePosition(el, storageKey);
+                // ensure no-overlap constraint after a drag
+                try { layoutWidgetsStackedBottomRight(); } catch (_) {}
+            };
+
+            const startDrag = (e) => {
+                // Start from element's current on-screen rect
+                const elRect = el.getBoundingClientRect();
+                el.style.left = elRect.left + 'px';
+                el.style.top = elRect.top + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+
+                dragging = true;
+                startX = e.touches ? e.touches[0].clientX : e.clientX;
+                startY = e.touches ? e.touches[0].clientY : e.clientY;
+                origLeft = elRect.left;
+                origTop = elRect.top;
+
+                document.addEventListener('mousemove', onMove);
+                document.addEventListener('mouseup', endDrag);
+                document.addEventListener('touchmove', onMove, { passive: false });
+                document.addEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grabbing');
+                e.preventDefault();
+            };
+
+            (handleEl || el).addEventListener('mousedown', startDrag);
+            (handleEl || el).addEventListener('touchstart', startDrag, { passive: false });
+
+            // Apply any saved position on init
+            restorePosition(el, storageKey);
+        }
+        function toggleCell(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+            }
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+            }
+            
+            updateIndicators(cellId);
+            encodeToolStateToUrl();
+        }
+        
+        function toggleCode(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+        
+        function toggleOutput(cellId) {
+            const outputElement = document.getElementById('output-' + cellId);
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+
+        function toggleUvLogs(headerElement) {
+            const contentElement = headerElement.nextElementSibling;
+            if (contentElement) {
+                const isCollapsed = contentElement.style.display === 'none';
+                contentElement.style.display = isCollapsed ? 'block' : 'none';
+                headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+
+                // Update the header indicator if it exists
+                const uvLogsDiv = headerElement.parentElement;
+                if (uvLogsDiv && uvLogsDiv.id && uvLogsDiv.id.startsWith('uv-logs-')) {
+                    const cellId = uvLogsDiv.id.replace('uv-logs-', '');
+                    const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+
+        function toggleUvLogsFromHeader(cellId) {
+            const uvLogsElement = document.getElementById('uv-logs-' + cellId);
+            const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+            if (uvLogsElement) {
+                const headerElement = uvLogsElement.querySelector('.uv-logs-header');
+                const contentElement = uvLogsElement.querySelector('.uv-logs-content');
+                if (contentElement && headerElement) {
+                    const isCollapsed = contentElement.style.display === 'none';
+                    contentElement.style.display = isCollapsed ? 'block' : 'none';
+                    headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+        
+        function updateIndicators(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            const indicators = document.querySelector(`[onclick*="${cellId}"]`)?.closest('.cell-header')?.querySelector('.collapse-indicators');
+            
+            if (indicators) {
+                const codeCollapsed = codeElement && codeElement.classList.contains('collapsed');
+                const outputCollapsed = outputElement && outputElement.classList.contains('collapsed');
+                
+                const codeIcon = codeCollapsed ? '▶' : '▼';
+                const outputIcon = outputCollapsed ? '▶' : '▼';
+                
+                const codeSpan = indicators.querySelector('[onclick*="toggleCode"]');
+                const outputSpan = indicators.querySelector('[onclick*="toggleOutput"]');
+                
+                if (codeSpan) codeSpan.innerHTML = `${codeIcon} code`;
+                if (outputSpan) outputSpan.innerHTML = `${outputIcon} output`;
+            }
+        }
+        
+        function toggleTheme() {
+            const html = document.documentElement;
+            const currentTheme = html.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            html.setAttribute('data-theme', newTheme);
+            localStorage.setItem('uvnote-theme', newTheme);
+            updateThemeIcon();
+            updateUiDebug();
+        }
+        
+        // Two panel code removed
+        
+        function updateThemeIcon() {
+            const theme = document.documentElement.getAttribute('data-theme');
+            const toggle = document.querySelector('.theme-toggle');
+            if (toggle) {
+                toggle.textContent = theme === 'dark' ? 'light' : 'dark';
+            }
+        }
+        function setUiTheme(newUi) {
+            if (newUi !== 'default' && newUi !== 'none' && newUi !== 'monocolor') return;
+            const html = document.documentElement;
+            html.setAttribute('data-ui', newUi);
+            try { localStorage.setItem('uvnote-ui', newUi); } catch (_) {}
+            updateUiMenu();
+            updateUiDebug();
+        }
+        function updateUiMenu() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const checks = {
+                default: document.getElementById('checkbox-ui-default'),
+                none: document.getElementById('checkbox-ui-none'),
+                monocolor: document.getElementById('checkbox-ui-monocolor')
+            };
+            if (checks.default) checks.default.textContent = ui === 'default' ? '☑' : '☐';
+            if (checks.none) checks.none.textContent = ui === 'none' ? '☑' : '☐';
+            if (checks.monocolor) checks.monocolor.textContent = ui === 'monocolor' ? '☑' : '☐';
+        }
+
+        function updateUiDebug() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const color = document.documentElement.getAttribute('data-theme') || 'light';
+            const el = document.getElementById('ui-debug');
+            if (el) {
+                el.textContent = `UI: ${ui} | Color: ${color}`;
+            }
+        }
+
+        // Line selection and deep-linking
+        function clearLineSelections() {
+            try {
+                document.querySelectorAll('.code-line-highlight').forEach(el => { el.style.display = 'none'; });
+                document.querySelectorAll('.line-number.selected').forEach(el => el.classList.remove('selected'));
+            } catch (_) {}
+        }
+
+        let _selection = null; // { cellId, a, b }
+
+        function clearSelection(updateUrl) {
+            clearLineSelections();
+            _selection = null;
+            if (updateUrl) {
+                try {
+                    const url = new URL(window.location.href);
+                    url.searchParams.delete('cell');
+                    url.searchParams.delete('line');
+                    history.replaceState(null, '', url.toString());
+                } catch (_) {}
+            }
+            updateStateIndicator();
+        }
+
+        function selectCellLine(cellId, line, updateUrl) {
+            try {
+                // Ensure only one selection across the whole document
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                // Measure line height directly from computed style
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                // Position overlay
+                overlay.style.display = 'block';
+                overlay.style.height = `${lh}px`;
+                overlay.style.top = `${pre.offsetTop + padTop + (line - 1) * lh}px`;
+
+                // Update selected class in line numbers
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    const sel = numbers.querySelector(`.line-number[data-line="${line}"]`);
+                    if (sel) sel.classList.add('selected');
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    url.searchParams.set('line', String(line));
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a: line, b: line };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLine error', e); }
+        }
+
+        function selectCellLines(cellId, startLine, endLine, updateUrl) {
+            try {
+                // normalize order
+                const a = Math.min(startLine, endLine);
+                const b = Math.max(startLine, endLine);
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                overlay.style.display = 'block';
+                overlay.style.top = `${pre.offsetTop + padTop + (a - 1) * lh}px`;
+                overlay.style.height = `${(b - a + 1) * lh}px`;
+
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    for (let i = a; i <= b; i++) {
+                        const el = numbers.querySelector(`.line-number[data-line="${i}"]`);
+                        if (el) el.classList.add('selected');
+                    }
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    if (a === b) url.searchParams.set('line', String(a));
+                    else url.searchParams.set('line', `${a}-${b}`);
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a, b };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLines error', e); }
+        }
+
+        // Drag-to-select support on line numbers
+        let _lineDrag = { active: false, cellId: null, start: 0 };
+        function onLineNumberMouseDown(e) {
+            const a = e.target.closest('.line-number');
+            if (!a) return;
+            e.preventDefault();
+            const cellId = a.dataset.cell;
+            const line = parseInt(a.dataset.line || '1', 10) || 1;
+            // Toggle off if this exact single line is already the only selection
+            const numbers = document.getElementById(`lines-${cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length === 1 && selected[0] === line) {
+                    clearSelection(true);
+                    return;
+                }
+            }
+            _lineDrag.active = true;
+            _lineDrag.cellId = cellId;
+            _lineDrag.start = line;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, _lineDrag.start, false);
+        }
+        function onDocMouseMove(e) {
+            if (!_lineDrag.active) return;
+            const el = document.elementFromPoint(e.clientX, e.clientY);
+            if (!el) return;
+            const a = el.closest && el.closest('.line-number');
+            if (!a) return;
+            if (a.dataset.cell !== _lineDrag.cellId) return;
+            const cur = parseInt(a.dataset.line || '1', 10) || 1;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, cur, false);
+        }
+        function onDocMouseUp(e) {
+            if (!_lineDrag.active) return;
+            const last = document.querySelector('.line-number.selected:last-of-type');
+            // finalize URL using the current selected range
+            const numbers = document.getElementById(`lines-${_lineDrag.cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length) {
+                    const a = Math.min(...selected); const b = Math.max(...selected);
+                    selectCellLines(_lineDrag.cellId, a, b, true);
+                }
+            }
+            _lineDrag.active = false; _lineDrag.cellId = null; _lineDrag.start = 0;
+        }
+
+        function applyLocationFromUrl() {
+            try {
+                const url = new URL(window.location.href);
+                const cell = url.searchParams.get('cell');
+                const lineParam = url.searchParams.get('line');
+                if (cell && lineParam) {
+                    if (lineParam.includes('-')) {
+                        const [a, b] = lineParam.split('-').map(x => parseInt(x, 10));
+                        if (!Number.isNaN(a) && !Number.isNaN(b)) selectCellLines(cell, a, b, false);
+                    } else {
+                        const l = parseInt(lineParam, 10);
+                        if (!Number.isNaN(l)) selectCellLine(cell, l, false);
+                    }
+                }
+                
+                // Apply tool parameters from URL
+                applyToolsFromUrl(url.searchParams);
+                
+                // Cell states will be applied later in DOMContentLoaded with proper timing
+                const encodedCellStates = url.searchParams.get('cells');
+                console.log('Encoded cell states from URL:', encodedCellStates);
+            } catch (_) {}
+        }
+        
+        function applyToolsFromUrl(params) {
+            try {
+                // Check if tools widget should be shown
+                const showTools = params.get('tools');
+                if (showTools === '1') {
+                    // Mark that tool was loaded from URL
+                    _urlLoadedTool = true;
+                    
+                    // Apply color
+                    const color = params.get('color');
+                    if (color && /^[0-9a-fA-F]{6}$/.test(color)) {
+                        setStoredArrowColor('#' + color);
+                    }
+                    
+                    // Apply thickness
+                    const thickness = params.get('thickness');
+                    if (thickness) {
+                        const value = parseInt(thickness, 10);
+                        if (value >= 1 && value <= 10) {
+                            setStoredLineThickness(value);
+                        }
+                    }
+                    
+                    // Don't override fadeout time for URL-loaded tools - let individual shapes decide
+                    
+                    // Load shapes from URL
+                    const encodedShapes = params.get('shapes');
+                    if (encodedShapes) {
+                        const decodedShapes = decodeShapesFromUrl(encodedShapes);
+                        if (decodedShapes.length > 0) {
+                            _shapes = decodedShapes;
+                            saveShapes();
+                            // Trigger render after overlay is initialized
+                            setTimeout(() => {
+                                renderOverlay();
+                            }, 300);
+                        }
+                    }
+                    
+                    // Wait for widgets to be initialized before showing tools
+                    setTimeout(() => {
+                        const toolsWidget = document.querySelector('.tools-widget');
+                        const checkbox = document.getElementById('checkbox-tools');
+                        if (toolsWidget && checkbox) {
+                            toolsWidget.style.display = 'block';
+                            checkbox.textContent = '☑';
+                            localStorage.setItem('uvnote-widget-tools', 'visible');
+                        }
+                        
+                        // Apply active tool
+                        const activeTool = params.get('tool');
+                        if (activeTool && ['arrow', 'pen', 'eraser', 'spotlight'].includes(activeTool)) {
+                            const toolBtn = Array.from(document.querySelectorAll('.tool-button')).find(btn => btn.textContent === activeTool);
+                            if (toolBtn) {
+                                toolBtn.click();
+                            }
+                        }
+                        
+                        // Re-layout widgets after showing tools
+                        layoutWidgetsStackedBottomRight();
+                    }, 200);
+                }
+            } catch (_) {}
+        }
+        
+        function captureInitialCellStates() {
+            const cells = document.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    if (codeEl) {
+                        state.c = codeEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    if (outputEl) {
+                        state.o = outputEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    _initialCellStates[cellId] = state;
+                }
+            });
+            console.log('Captured initial cell states:', _initialCellStates);
+        }
+
+        function encodeCellStatesToUrl() {
+            // Get all cells and their collapse states
+            const cells = document.querySelectorAll('.cell');
+            const cellStates = {};
+            
+            console.log('Found cells:', cells.length);
+            
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                const initialState = _initialCellStates[cellId] || {};
+                
+                console.log(`Encoding cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    codeCollapsed: codeEl ? codeEl.classList.contains('collapsed') : 'N/A',
+                    outputCollapsed: outputEl ? outputEl.classList.contains('collapsed') : 'N/A',
+                    initialState: initialState
+                });
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    let hasChanges = false;
+                    
+                    if (codeEl) {
+                        const currentCodeState = codeEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialCodeState = initialState.c;
+                        // Only encode if different from initial state
+                        if (initialCodeState !== undefined && currentCodeState !== initialCodeState) {
+                            state.c = currentCodeState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    if (outputEl) {
+                        const currentOutputState = outputEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialOutputState = initialState.o;
+                        // Only encode if different from initial state
+                        if (initialOutputState !== undefined && currentOutputState !== initialOutputState) {
+                            state.o = currentOutputState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    // Only include cell if it has changes from initial state
+                    if (hasChanges) {
+                        cellStates[cellId] = state;
+                        console.log(`Added cell ${cellId}:`, state);
+                    }
+                }
+            });
+            
+            console.log('Final cell states to encode:', cellStates);
+            
+            // Return empty string if no changed cells
+            if (Object.keys(cellStates).length === 0) return '';
+            
+            // Encode as compact base64 string
+            const encoded = btoa(JSON.stringify(cellStates));
+            console.log('Encoded cell states:', encoded);
+            return encoded;
+        }
+        
+        function decodeCellStatesFromUrl(encodedStates) {
+            if (!encodedStates) return {};
+            
+            try {
+                return JSON.parse(atob(encodedStates));
+            } catch (e) {
+                console.error('Failed to decode cell states:', e);
+                return {};
+            }
+        }
+        
+        function applyCellStatesFromUrl(cellStates) {
+            console.log('Applying cell states from URL:', cellStates);
+            Object.entries(cellStates).forEach(([cellId, state]) => {
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                console.log(`Cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    state: state
+                });
+                
+                if (codeEl && state.c !== undefined) {
+                    if (state.c === 0) {
+                        codeEl.classList.add('collapsed');
+                        console.log(`Collapsed code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    } else {
+                        codeEl.classList.remove('collapsed');
+                        codeEl.classList.add('expanded'); // Explicitly add expanded class
+                        console.log(`Expanded code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            hasExpandedClass: codeEl.classList.contains('expanded'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    }
+                }
+                
+                if (outputEl && state.o !== undefined) {
+                    if (state.o === 0) {
+                        outputEl.classList.add('collapsed');
+                        console.log(`Collapsed output for cell ${cellId}`);
+                    } else {
+                        outputEl.classList.remove('collapsed');
+                        console.log(`Expanded output for cell ${cellId}`);
+                    }
+                }
+                
+                // Update visual indicators and force style recalculation
+                try {
+                    updateIndicators(cellId);
+                    // Force browser to recalculate styles
+                    if (codeEl) {
+                        codeEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - code visible: ${getComputedStyle(codeEl).display !== 'none'}`);
+                    }
+                    if (outputEl) {
+                        outputEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - output visible: ${getComputedStyle(outputEl).display !== 'none'}`);
+                    }
+                } catch (e) {
+                    console.error(`Error updating indicators for cell ${cellId}:`, e);
+                }
+            });
+        }
+
+        function encodeShapesToUrl() {
+            // Encode shapes as compact base64 string
+            if (_shapes.length === 0) return '';
+            
+            const shapeData = _shapes.map(shape => {
+                const baseData = {
+                    ct: shape.createdAt, // creation timestamp
+                    fo: shape.fadeoutTime || getFadeoutTime() // fadeout time for this shape
+                };
+                
+                if (shape.type === 'arrow') {
+                    return {
+                        ...baseData,
+                        t: 'a',
+                        x1: Math.round(shape.x1),
+                        y1: Math.round(shape.y1),
+                        x2: Math.round(shape.x2),
+                        y2: Math.round(shape.y2),
+                        c: shape.color.substring(1), // remove #
+                        w: shape.width
+                    };
+                } else if (shape.type === 'pen') {
+                    return {
+                        ...baseData,
+                        t: 'p',
+                        pts: shape.points.map(p => [Math.round(p.x), Math.round(p.y)]),
+                        c: shape.color.substring(1),
+                        w: shape.width
+                    };
+                } else if (shape.type === 'spotlight') {
+                    return {
+                        ...baseData,
+                        t: 's',
+                        x: Math.round(shape.x),
+                        y: Math.round(shape.y),
+                        r: Math.round(shape.radius)
+                    };
+                }
+            }).filter(Boolean);
+            
+            return btoa(JSON.stringify(shapeData));
+        }
+        
+        function decodeShapesFromUrl(encodedShapes) {
+            if (!encodedShapes) return [];
+            
+            try {
+                const shapeData = JSON.parse(atob(encodedShapes));
+                return shapeData.map(data => {
+                    const base = {
+                        createdAt: data.ct || Date.now(), // use encoded timestamp or current time
+                        fadeoutTime: data.fo || 0, // use encoded fadeout time or 0 (never fade)
+                        opacity: 1.0
+                    };
+                    
+                    if (data.t === 'a') {
+                        return {
+                            ...base,
+                            type: 'arrow',
+                            x1: data.x1,
+                            y1: data.y1,
+                            x2: data.x2,
+                            y2: data.y2,
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 'p') {
+                        return {
+                            ...base,
+                            type: 'pen',
+                            points: data.pts.map(([x, y]) => ({ x, y })),
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 's') {
+                        return {
+                            ...base,
+                            type: 'spotlight',
+                            x: data.x,
+                            y: data.y,
+                            radius: data.r,
+                            color: '#000000'
+                        };
+                    }
+                }).filter(Boolean);
+            } catch (e) {
+                console.error('Failed to decode shapes:', e);
+                return [];
+            }
+        }
+        
+        function encodeToolStateToUrl() {
+            // Don't update URL during initialization
+            if (_isInitializing) {
+                return window.location.href;
+            }
+            
+            const params = new URLSearchParams(window.location.search);
+            
+            // Check if tools widget is visible and has an active tool
+            const toolsWidget = document.querySelector('.tools-widget');
+            const activeTool = document.body.dataset.tool;
+            const hasActiveTool = activeTool && activeTool !== 'none';
+            const toolsWidgetVisible = toolsWidget && getComputedStyle(toolsWidget).display !== 'none';
+            
+            // Always handle shapes regardless of tool state
+            const encodedShapes = encodeShapesToUrl();
+            if (encodedShapes) {
+                params.set('shapes', encodedShapes);
+            } else {
+                params.delete('shapes');
+            }
+            
+            // Always preserve existing cell states from URL if present
+            const existingCellStates = params.get('cells');
+            if (existingCellStates) {
+                // Keep existing cell states - don't re-encode from DOM
+                params.set('cells', existingCellStates);
+            } else {
+                // Only encode new cell states if none exist in URL
+                const encodedCellStates = encodeCellStatesToUrl();
+                if (encodedCellStates) {
+                    params.set('cells', encodedCellStates);
+                }
+            }
+            
+            if (toolsWidgetVisible && hasActiveTool) {
+                // Include tool params when widget is visible AND tool is active
+                params.set('tools', '1');
+                params.set('tool', activeTool);
+                
+                // Get color (without # prefix)
+                const color = getArrowColor();
+                if (color && color.startsWith('#')) {
+                    params.set('color', color.substring(1));
+                }
+                
+                // Get thickness
+                const thickness = getLineThickness();
+                params.set('thickness', thickness.toString());
+            } else {
+                // Remove tool state params but keep shapes
+                params.delete('tools');
+                params.delete('tool');
+                params.delete('color');
+                params.delete('thickness');
+                params.delete('fadeout');
+            }
+            
+            // Update URL without reloading
+            const newUrl = window.location.pathname + (params.toString() ? '?' + params.toString() : '') + window.location.hash;
+            window.history.replaceState(null, '', newUrl);
+            
+            return window.location.href;
+        }
+
+        function resetLayout() {
+            try {
+                // Clear all uvnote-* keys
+                const allKeys = Object.keys(localStorage);
+                const uvnoteKeys = allKeys.filter(key => key.startsWith('uvnote-'));
+                uvnoteKeys.forEach(k => localStorage.removeItem(k));
+            } catch (_) {}
+            
+            // Clear any active selection and remove URL params
+            try { clearSelection(true); } catch(_) {}
+            // Reset active tool if any
+            try { window.setActiveTool('none'); } catch(_) {}
+            // Clear shapes
+            try { _shapes = []; saveShapes(); } catch(_) {}
+            // Reset URL-loaded tool flag
+            try { _urlLoadedTool = false; } catch(_) {}
+            // Reset all cells to expanded state
+            try {
+                const cells = document.querySelectorAll('.cell');
+                cells.forEach(cell => {
+                    const cellId = cell.id.replace('cell-', '');
+                    const codeEl = document.getElementById('code-' + cellId);
+                    const outputEl = document.getElementById('output-' + cellId);
+                    if (codeEl) codeEl.classList.remove('collapsed');
+                    if (outputEl) outputEl.classList.remove('collapsed');
+                    updateIndicators(cellId);
+                });
+            } catch(_) {}
+            
+            // Clear ALL URL parameters and reload with clean URL
+            try {
+                const cleanUrl = window.location.pathname + window.location.hash;
+                window.location.href = cleanUrl; // Use window.location.href instead of history.replaceState + reload
+            } catch (_) {
+                // Fallback - reload current page
+                location.reload();
+            }
+        }
+
+        function toggleMenu() {
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.toggle('active');
+            }
+        }
+
+        function toggleWidget(widgetName) {
+            let widget;
+            let checkbox;
+            
+            // Close the menu first
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.remove('active');
+            }
+            
+            switch(widgetName) {
+                case 'tools':
+                    widget = document.querySelector('.tools-widget');
+                    checkbox = document.getElementById('checkbox-tools');
+                    break;
+                case 'file-explorer':
+                    widget = document.querySelector('.file-explorer');
+                    checkbox = document.getElementById('checkbox-file-explorer');
+                    break;
+                case 'minimap':
+                    widget = document.querySelector('.minimap');
+                    checkbox = document.getElementById('checkbox-minimap');
+                    break;
+                case 'status':
+                    widget = document.querySelector('.status-widget');
+                    checkbox = document.getElementById('checkbox-status');
+                    break;
+                default:
+                    return;
+            }
+            
+            if (widget && checkbox) {
+                const isVisible = getComputedStyle(widget).display !== 'none';
+                widget.style.display = isVisible ? 'none' : 'block';
+                checkbox.textContent = isVisible ? '☐' : '☑';
+                
+                // Save state to localStorage
+                try {
+                    localStorage.setItem(`uvnote-widget-${widgetName}`, isVisible ? 'hidden' : 'visible');
+                } catch (_) {}
+                
+                // Re-layout widgets after visibility change
+                try { 
+                    layoutWidgetsStackedBottomRight(); 
+                } catch (_) {}
+                
+                // Update URL when tools widget visibility changes
+                if (widgetName === 'tools') {
+                    encodeToolStateToUrl();
+                }
+            }
+        }
+        
+        function initializeWidgetVisibility() {
+            const widgets = [
+                { name: 'tools', selector: '.tools-widget' },
+                { name: 'file-explorer', selector: '.file-explorer' },
+                { name: 'minimap', selector: '.minimap' },
+                { name: 'status', selector: '.status-widget' }
+            ];
+            
+            widgets.forEach(({ name, selector }) => {
+                const defaultState = name === 'status' ? 'visible' : 'hidden';
+                const savedState = localStorage.getItem(`uvnote-widget-${name}`) || defaultState;
+                const widget = document.querySelector(selector);
+                const checkbox = document.getElementById(`checkbox-${name}`);
+                
+                if (widget && checkbox) {
+                    const isVisible = savedState === 'visible';
+                    widget.style.display = isVisible ? 'block' : 'none';
+                    checkbox.textContent = isVisible ? '☑' : '☐';
+                }
+            });
+        }
+
+        // Close menu when clicking outside
+        document.addEventListener('click', function(event) {
+            const menuButton = document.querySelector('.menu-button');
+            // Don't close if clicking on a menu item (let the item handler close it)
+            if (menuButton && !menuButton.contains(event.target)) {
+                menuButton.classList.remove('active');
+            }
+        });
+
+        // Layout: stack widgets bottom-right and equalize widths
+        function hasCustomWidgetPositions() {
+            try {
+                return (
+                    localStorage.getItem('uvnote-minimap-pos') ||
+                    localStorage.getItem('uvnote-file-explorer-pos') ||
+                    localStorage.getItem('uvnote-tools-pos')
+                );
+            } catch (_) { return false; }
+        }
+
+        function rectsOverlap(r1, r2) {
+            return !(r1.right <= r2.left || r2.right <= r1.left || r1.bottom <= r2.top || r2.bottom <= r1.top);
+        }
+
+        function widgetsOverlap(widgets) {
+            for (let i = 0; i < widgets.length; i++) {
+                const a = widgets[i];
+                const ra = a.getBoundingClientRect();
+                for (let j = i + 1; j < widgets.length; j++) {
+                    const b = widgets[j];
+                    const rb = b.getBoundingClientRect();
+                    if (rectsOverlap(ra, rb)) return true;
+                }
+            }
+            return false;
+        }
+
+        function applyStackLayout(widgets, order) {
+            if (!widgets.length) return;
+            // Fixed equal width
+            const fixedWidth = 220;
+            widgets.forEach(el => { el.style.width = fixedWidth + 'px'; });
+
+            // Fit heights if needed to avoid overflow
+            const gap = 12;
+            const available = Math.max(0, window.innerHeight - 40 - gap * (order.length - 1));
+            const eachMax = Math.floor(available / order.length);
+            order.forEach(el => {
+                el.style.maxHeight = eachMax + 'px';
+                el.style.overflowY = 'auto';
+            });
+
+            // Stack bottom-up in the requested order
+            let bottomOffset = 20; // base gutter
+            order.forEach(el => {
+                el.style.left = 'auto';
+                el.style.top = 'auto';
+                el.style.right = '20px';
+                el.style.bottom = bottomOffset + 'px';
+                bottomOffset += el.offsetHeight + gap;
+            });
+        }
+
+        function layoutWidgetsStackedBottomRight() {
+            const minimap = document.querySelector('.minimap');
+            const fileExplorer = document.querySelector('.file-explorer');
+            const tools = document.querySelector('.tools-widget');
+            const status = document.querySelector('.status-widget');
+            const widgets = [minimap, fileExplorer, tools, status].filter(el => el && getComputedStyle(el).display !== 'none');
+            if (!widgets.length) return;
+
+            const order = [minimap, fileExplorer, tools, status].filter(Boolean).filter(el => getComputedStyle(el).display !== 'none');
+
+            // If user placed custom positions and there is no overlap, respect them.
+            if (hasCustomWidgetPositions() && !widgetsOverlap(widgets)) return;
+
+            applyStackLayout(widgets, order);
+        }
+        
+        // Panel icon removed
+        
+        let _minimapScrollContainer = null;
+        let _minimapScrollHandler = null;
+        function initMinimap() {
+            // Generate minimap content
+            const minimap = createMinimap();
+            document.body.appendChild(minimap);
+            // Make draggable (use title as handle)
+            const mTitle = minimap.querySelector('.minimap-title');
+            makeDraggable(minimap, 'uvnote-minimap-pos', mTitle);
+
+            // Attach scroll listener to window (two-panel removed)
+            _minimapScrollContainer = window;
+
+            if (_minimapScrollContainer) {
+                _minimapScrollHandler = () => updateMinimapActive();
+                if (_minimapScrollContainer === window) {
+                    window.addEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.addEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            updateMinimapActive();
+        }
+
+        function teardownMinimap() {
+            const minimap = document.querySelector('.minimap');
+            if (minimap && minimap.parentNode) minimap.parentNode.removeChild(minimap);
+            if (_minimapScrollContainer && _minimapScrollHandler) {
+                if (_minimapScrollContainer === window) {
+                    window.removeEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.removeEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            _minimapScrollContainer = null;
+            _minimapScrollHandler = null;
+        }
+        
+        function initFileExplorer() {
+            // Generate file explorer content
+            const fileExplorer = createFileExplorer();
+            document.body.appendChild(fileExplorer);
+        }
+        
+        function createMinimap() {
+            const minimap = document.createElement('div');
+            minimap.className = 'minimap';
+            
+            const title = document.createElement('div');
+            title.className = 'minimap-title';
+            title.textContent = 'navigation';
+            minimap.appendChild(title);
+            
+            // Find all headings and cells
+            const root = document.querySelector('.main-content') || document;
+            const headings = root.querySelectorAll('h1, h2, h3, h4, h5, h6');
+            const cells = root.querySelectorAll('.cell');
+            
+            // Combine and sort by position
+            const items = [];
+            
+            headings.forEach(heading => {
+                const id = heading.id || generateId(heading.textContent);
+                if (!heading.id) heading.id = id;
+                
+                items.push({
+                    element: heading,
+                    type: 'heading',
+                    level: parseInt(heading.tagName.charAt(1)),
+                    text: heading.textContent.trim(),
+                    id: id,
+                    position: heading.getBoundingClientRect().top + window.scrollY
+                });
+            });
+            
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const id = cell.id || `cell-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+                    if (!cell.id) cell.id = id;
+                    
+                    items.push({
+                        element: cell,
+                        type: 'cell',
+                        text: header.textContent.trim(),
+                        id: id,
+                        position: cell.getBoundingClientRect().top + window.scrollY
+                    });
+                }
+            });
+            
+            // Sort by position
+            items.sort((a, b) => a.position - b.position);
+            
+            // Create minimap items
+            items.forEach(item => {
+                const link = document.createElement('a');
+                link.className = `minimap-item ${item.type === 'heading' ? 'minimap-heading' : 'minimap-cell'}`;
+                if (item.type === 'heading') {
+                    link.classList.add(`h${item.level}`);
+                }
+                link.textContent = item.text.length > 25 ? item.text.substring(0, 22) + '...' : item.text;
+                link.href = `#${item.id}`;
+                link.onclick = function(e) {
+                    e.preventDefault();
+                    item.element.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                };
+                minimap.appendChild(link);
+            });
+            
+            return minimap;
+        }
+        
+        function generateId(text) {
+            return text.toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '')
+                .substring(0, 20);
+        }
+        
+        function updateMinimapActive() {
+            const minimapItems = document.querySelectorAll('.minimap-item');
+            const container = _minimapScrollContainer || window;
+            const containerRect = container === window ? null : container.getBoundingClientRect();
+            const scrollPos = (container === window ? window.scrollY : container.scrollTop) + 100; // Offset for better detection
+            
+            let activeItem = null;
+            minimapItems.forEach(item => {
+                const targetId = item.getAttribute('href').substring(1);
+                const target = document.getElementById(targetId);
+                
+                if (target) {
+                    const rectTop = target.getBoundingClientRect().top;
+                    const targetPos = (container === window)
+                        ? rectTop + window.scrollY
+                        : rectTop - containerRect.top + container.scrollTop;
+                    if (targetPos <= scrollPos) {
+                        activeItem = item;
+                    }
+                }
+                
+                item.classList.remove('active');
+            });
+            
+            if (activeItem) {
+                activeItem.classList.add('active');
+            }
+        }
+        
+        function createFileExplorer() {
+            const fileExplorer = document.createElement('div');
+            fileExplorer.className = 'file-explorer';
+            
+            const title = document.createElement('div');
+            title.className = 'file-explorer-title';
+            title.textContent = 'files';
+            fileExplorer.appendChild(title);
+            // Make draggable (use title as handle)
+            makeDraggable(fileExplorer, 'uvnote-file-explorer-pos', title);
+            
+            // Scripts section
+            const scriptsSection = document.createElement('div');
+            scriptsSection.className = 'file-explorer-section';
+            
+            const scriptsTitle = document.createElement('div');
+            scriptsTitle.className = 'file-explorer-section-title';
+            scriptsTitle.textContent = 'scripts';
+            scriptsSection.appendChild(scriptsTitle);
+            
+            // Find all cells and list their script files (single panel)
+            const root = document.querySelector('.main-content') || document;
+            const cells = root.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const cellText = header.textContent.trim();
+                    const cellMatch = cellText.match(/Cell: ([a-zA-Z_][a-zA-Z0-9_]*)/);
+                    if (cellMatch) {
+                        const cellId = cellMatch[1];
+                        const scriptItem = document.createElement('div');
+                        scriptItem.className = 'file-explorer-item script';
+                        scriptItem.textContent = `${cellId}.py`;
+                        scriptItem.onclick = function() {
+                            cell.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        };
+                        scriptsSection.appendChild(scriptItem);
+                    }
+                }
+            });
+            
+            fileExplorer.appendChild(scriptsSection);
+            
+            // Artifacts section
+            const artifactsSection = document.createElement('div');
+            artifactsSection.className = 'file-explorer-section';
+            
+            const artifactsTitle = document.createElement('div');
+            artifactsTitle.className = 'file-explorer-section-title';
+            artifactsTitle.textContent = 'artifacts';
+            artifactsSection.appendChild(artifactsTitle);
+            
+            // Find all artifact links (single panel)
+            const artifactsRoot = document.querySelector('.main-content') || document;
+            const artifacts = artifactsRoot.querySelectorAll('.artifact');
+            if (artifacts.length === 0) {
+                const noArtifacts = document.createElement('div');
+                noArtifacts.className = 'file-explorer-item artifact';
+                noArtifacts.textContent = '(none)';
+                noArtifacts.style.opacity = '0.5';
+                artifactsSection.appendChild(noArtifacts);
+            } else {
+                artifacts.forEach(artifact => {
+                    const artifactItem = document.createElement('div');
+                    artifactItem.className = 'file-explorer-item artifact';
+                    artifactItem.textContent = artifact.textContent;
+                    artifactItem.onclick = function() {
+                        artifact.click();
+                    };
+                    artifactsSection.appendChild(artifactItem);
+                });
+            }
+            
+            fileExplorer.appendChild(artifactsSection);
+            
+            return fileExplorer;
+        }
+
+        function initStatusWidget() {
+            let el = document.querySelector('.status-widget');
+            if (!el) {
+                el = document.createElement('div');
+                el.className = 'status-widget';
+                el.id = 'status-widget';
+                el.textContent = 'ready — Esc';
+                document.body.appendChild(el);
+            }
+        }
+
+        // Tools widget
+        let _cursorX = 0; // last pointer x in overlay coordinates (written by onPointerMove)
+        let _cursorY = 0; // last pointer y in overlay coordinates (written by onPointerMove)
+        let _cursorVisible = false; // gates cursor-driven re-renders; toggled by setActiveTool and the pointer enter/leave handlers
+        
+        function setActiveTool(tool) {
+            if (!tool || tool === 'none') {
+                document.body.dataset.tool = 'none';
+                localStorage.setItem('uvnote-active-tool', 'none');
+                setOverlayActive(false);
+                _cursorVisible = false;
+                // Remove active class from all tool buttons when deactivating
+                const toolButtons = document.querySelectorAll('.tools-widget .tool-button');
+                toolButtons.forEach(btn => btn.classList.remove('active'));
+                updateStateIndicator();
+                encodeToolStateToUrl();
+                return;
+            }
+            document.body.dataset.tool = tool;
+            localStorage.setItem('uvnote-active-tool', tool);
+            setOverlayActive(true);
+            _cursorVisible = true;
+            updateStateIndicator();
+            encodeToolStateToUrl();
+        }
+
+        // Make setActiveTool globally accessible for ESC key handler
+        window.setActiveTool = setActiveTool;
+        
+
+        function getArrowColor() {
+            const saved = localStorage.getItem('uvnote-arrow-color');
+            if (saved) return saved;
+            return '#e53935'; // Default red color
+        }
+
+        function setStoredArrowColor(color) {
+            try { localStorage.setItem('uvnote-arrow-color', color); } catch (_) {}
+        }
+
+        function getLineThickness() {
+            const saved = localStorage.getItem('uvnote-line-thickness');
+            if (saved) return parseInt(saved, 10);
+            return 6; // default thickness
+        }
+
+        function setStoredLineThickness(thickness) {
+            try { localStorage.setItem('uvnote-line-thickness', thickness); } catch (_) {}
+        }
+
+        function getFadeoutTime() {
+            const saved = localStorage.getItem('uvnote-fadeout-time');
+            if (saved) return parseInt(saved, 10);
+            return 5; // default 5 seconds
+        }
+
+        function setStoredFadeoutTime(seconds) {
+            try { localStorage.setItem('uvnote-fadeout-time', seconds); } catch (_) {}
+        }
+
+        function createToolsWidget() {
+            const tools = document.createElement('div');
+            tools.className = 'tools-widget';
+
+            const title = document.createElement('div');
+            title.className = 'tools-title';
+            title.textContent = 'tools';
+            tools.appendChild(title);
+
+            const row = document.createElement('div');
+            row.className = 'tools-row';
+            tools.appendChild(row);
+
+            // Arrow tool
+            const arrowBtn = document.createElement('div');
+            arrowBtn.className = 'tool-button';
+            arrowBtn.textContent = 'arrow';
+            arrowBtn.onclick = function() {
+                const isActive = arrowBtn.classList.contains('active');
+                if (isActive) {
+                    arrowBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    arrowBtn.classList.add('active');
+                    setActiveTool('arrow');
+                }
+            };
+            row.appendChild(arrowBtn);
+
+            // Pen tool
+            const penBtn = document.createElement('div');
+            penBtn.className = 'tool-button';
+            penBtn.textContent = 'pen';
+            penBtn.onclick = function() {
+                const isActive = penBtn.classList.contains('active');
+                if (isActive) {
+                    penBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    penBtn.classList.add('active');
+                    setActiveTool('pen');
+                }
+            };
+            row.appendChild(penBtn);
+
+            // Eraser tool
+            const eraseBtn = document.createElement('div');
+            eraseBtn.className = 'tool-button';
+            eraseBtn.textContent = 'eraser';
+            eraseBtn.onclick = function() {
+                const isActive = eraseBtn.classList.contains('active');
+                if (isActive) {
+                    eraseBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    eraseBtn.classList.add('active');
+                    setActiveTool('eraser');
+                }
+            };
+            row.appendChild(eraseBtn);
+
+            // Spotlight tool
+            const spotlightBtn = document.createElement('div');
+            spotlightBtn.className = 'tool-button';
+            spotlightBtn.textContent = 'spotlight';
+            spotlightBtn.onclick = function() {
+                const isActive = spotlightBtn.classList.contains('active');
+                if (isActive) {
+                    spotlightBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    spotlightBtn.classList.add('active');
+                    setActiveTool('spotlight');
+                }
+            };
+            row.appendChild(spotlightBtn);
+
+            // Clear all
+            const clearBtn = document.createElement('div');
+            clearBtn.className = 'tool-button';
+            clearBtn.textContent = 'clear';
+            clearBtn.onclick = function() {
+                _shapes = [];
+                saveShapes();
+                renderOverlay();
+            };
+            row.appendChild(clearBtn);
+            
+            // We'll add the copy button at the end of the widget
+
+            // Restore active state from storage
+            const saved = localStorage.getItem('uvnote-active-tool') || 'none';
+            if (saved === 'arrow') {
+                arrowBtn.classList.add('active');
+                setActiveTool('arrow');
+            } else if (saved === 'pen') {
+                penBtn.classList.add('active');
+                setActiveTool('pen');
+            } else if (saved === 'eraser') {
+                eraseBtn.classList.add('active');
+                setActiveTool('eraser');
+            } else if (saved === 'spotlight') {
+                spotlightBtn.classList.add('active');
+                setActiveTool('spotlight');
+            }
+
+            // Color selector
+            const colorTitle = document.createElement('div');
+            colorTitle.className = 'tools-section-title';
+            colorTitle.textContent = 'color';
+            tools.appendChild(colorTitle);
+
+            const colorRow = document.createElement('div');
+            colorRow.className = 'tools-row color-row';
+            tools.appendChild(colorRow);
+
+            const swatchColors = [
+                // Primary colors
+                '#e53935', '#fb8c00', '#fdd835', '#43a047', '#1e88e5', '#8e24aa',
+                // Additional useful colors  
+                '#ff5722', '#795548', '#607d8b', '#9c27b0',
+                // Grayscale
+                '#000000', '#424242', '#9e9e9e', '#ffffff'
+            ];
+            const swatches = [];
+            swatchColors.forEach(c => {
+                const s = document.createElement('div');
+                s.className = 'color-swatch';
+                s.style.backgroundColor = c;
+                s.title = c;
+                s.onclick = () => {
+                    setStoredArrowColor(c);
+                    refreshColorUI(c);
+                    if (_cursorVisible) renderOverlay();
+                    encodeToolStateToUrl();
+                };
+                colorRow.appendChild(s);
+                swatches.push(s);
+            });
+
+            const colorInput = document.createElement('input');
+            colorInput.type = 'color';
+            colorInput.className = 'color-input';
+            colorInput.oninput = () => {
+                setStoredArrowColor(colorInput.value);
+                refreshColorUI(colorInput.value);
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+            colorRow.appendChild(colorInput);
+
+            function refreshColorUI(selected) {
+                const selectedHex = selected.startsWith('#') ? selected.toLowerCase() : rgbToHex(selected);
+                
+                swatches.forEach((s, i) => {
+                    const swatchHex = swatchColors[i].toLowerCase();
+                    if (swatchHex === selectedHex) {
+                        s.classList.add('selected');
+                    } else {
+                        s.classList.remove('selected');
+                    }
+                });
+                
+                try { 
+                    colorInput.value = selectedHex; 
+                } catch (_) {}
+            }
+
+            function rgbToHex(rgb) {
+                const m = rgb.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)\)/i);
+                if (!m) return '#000000';
+                const r = parseInt(m[1]).toString(16).padStart(2, '0');
+                const g = parseInt(m[2]).toString(16).padStart(2, '0');
+                const b = parseInt(m[3]).toString(16).padStart(2, '0');
+                return `#${r}${g}${b}`;
+            }
+
+            // Restore color selection
+            refreshColorUI(getArrowColor());
+
+            // Thickness slider
+            const thicknessTitle = document.createElement('div');
+            thicknessTitle.className = 'tools-section-title';
+            thicknessTitle.textContent = 'thickness';
+            tools.appendChild(thicknessTitle);
+
+            const thicknessRow = document.createElement('div');
+            thicknessRow.className = 'thickness-row';
+            tools.appendChild(thicknessRow);
+
+            const thicknessSlider = document.createElement('input');
+            thicknessSlider.type = 'range';
+            thicknessSlider.className = 'thickness-slider';
+            thicknessSlider.min = '1';
+            thicknessSlider.max = '10';
+            thicknessSlider.value = getLineThickness();
+            
+            const thicknessValue = document.createElement('span');
+            thicknessValue.className = 'thickness-value';
+            thicknessValue.textContent = thicknessSlider.value + 'px';
+
+            thicknessSlider.oninput = function() {
+                const value = parseInt(thicknessSlider.value, 10);
+                setStoredLineThickness(value);
+                thicknessValue.textContent = value + 'px';
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+
+            thicknessRow.appendChild(thicknessSlider);
+            thicknessRow.appendChild(thicknessValue);
+
+            // Fadeout time slider
+            const fadeoutTitle = document.createElement('div');
+            fadeoutTitle.className = 'tools-section-title';
+            fadeoutTitle.textContent = 'fadeout time';
+            tools.appendChild(fadeoutTitle);
+
+            const fadeoutRow = document.createElement('div');
+            fadeoutRow.className = 'thickness-row';
+            tools.appendChild(fadeoutRow);
+
+            const fadeoutSlider = document.createElement('input');
+            fadeoutSlider.type = 'range';
+            fadeoutSlider.className = 'thickness-slider';
+            fadeoutSlider.min = '0';
+            fadeoutSlider.max = '30';
+            fadeoutSlider.value = getFadeoutTime();
+            
+            const fadeoutValue = document.createElement('span');
+            fadeoutValue.className = 'thickness-value';
+            fadeoutValue.textContent = fadeoutSlider.value === '0' ? 'never' : fadeoutSlider.value + 's';
+
+            fadeoutSlider.oninput = function() {
+                const value = parseInt(fadeoutSlider.value, 10);
+                setStoredFadeoutTime(value);
+                fadeoutValue.textContent = value === 0 ? 'never' : value + 's';
+                encodeToolStateToUrl();
+            };
+
+            fadeoutRow.appendChild(fadeoutSlider);
+            fadeoutRow.appendChild(fadeoutValue);
+
+            // Draggable behavior
+            makeDraggable(tools, 'uvnote-tools-pos', title);
+
+            return tools;
+        }
+
+        function initTools() {
+            const widget = createToolsWidget();
+            document.body.appendChild(widget);
+        }
+
+        function teardownTools() {
+            const w = document.querySelector('.tools-widget');
+            if (w && w.parentNode) w.parentNode.removeChild(w);
+        }
+
+        // --- Canvas overlay for tools ---
+        let _overlay = null; // overlay canvas element, or null when absent
+        let _overlayCtx = null; // NOTE(review): presumably the 2D context of _overlay — confirm where it is assigned
+        let _overlayContainer = null; // scroll container the overlay tracks; set to window by updateOverlayModeAndContainer
+        let _overlayMode = 'single'; // only ever 'single' in the code visible here
+        let _overlayResizeHandler = null; // NOTE(review): presumably the registered resize listener — assigned outside this section
+        let _overlayScrollHandler = null; // NOTE(review): presumably the registered scroll listener — assigned outside this section
+        let _drawing = null; // current in-progress shape (arrow {x1,y1,x2,y2}, pen stroke, or spotlight), or null when idle
+        let _shapes = []; // committed shapes; persisted by saveShapes
+        let _fadeTimer = null; // timer for fade animation
+        let _urlLoadedTool = false; // track if tool was loaded from URL
+        let _isInitializing = true; // prevent URL updates during initialization
+        let _initialCellStates = {}; // track initial cell states from page load
+
+        function getOverlayStorageKey() { return 'uvnote-shapes'; }
+
+        function loadShapes() {
+            try {
+                const raw = localStorage.getItem(getOverlayStorageKey());
+                _shapes = raw ? JSON.parse(raw) : [];
+            } catch (_) { _shapes = []; }
+        }
+
+        function saveShapes() {
+            try { 
+                localStorage.setItem(getOverlayStorageKey(), JSON.stringify(_shapes));
+                // Always update URL when shapes change
+                encodeToolStateToUrl();
+            } catch (_) {}
+        }
+
+        function updateShapesFade() {
+            const now = Date.now();
+            let needsUpdate = false;
+
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const shape = _shapes[i];
+                if (!shape.createdAt) continue; // Skip old shapes without timestamps
+                
+                // Use individual shape's fadeout time, or global if not set
+                const shapesFadeoutSeconds = shape.fadeoutTime !== undefined ? shape.fadeoutTime : getFadeoutTime();
+                
+                // Skip fading if fadeout is disabled for this shape
+                if (shapesFadeoutSeconds === 0) continue;
+                
+                const fadeStartTime = Math.max(0, (shapesFadeoutSeconds - 2) * 1000); // Start fading 2s before end
+                const fadeEndTime = shapesFadeoutSeconds * 1000; // Fully gone after specified time
+                const age = now - shape.createdAt;
+                
+                if (age >= fadeEndTime) {
+                    // Remove completely faded shapes
+                    _shapes.splice(i, 1);
+                    needsUpdate = true;
+                } else if (age >= fadeStartTime) {
+                    // Update opacity for fading shapes
+                    const fadeProgress = (age - fadeStartTime) / (fadeEndTime - fadeStartTime);
+                    const newOpacity = 1 - fadeProgress;
+                    if (Math.abs(shape.opacity - newOpacity) > 0.01) {
+                        shape.opacity = newOpacity;
+                        needsUpdate = true;
+                    }
+                }
+            }
+
+            if (needsUpdate) {
+                saveShapes();
+                renderOverlay();
+                // Update URL to remove faded shapes  
+                encodeToolStateToUrl();
+            }
+        }
+
+        function getContentContainer() { return window; }
+
+        function updateOverlayModeAndContainer() {
+            _overlayContainer = window;
+            _overlayMode = 'single';
+        }
+
+        function updateOverlayBounds() {
+            if (!_overlay) return;
+            if (_overlayContainer === window) {
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = '0px';
+                _overlay.style.top = '0px';
+                _overlay.width = window.innerWidth;
+                _overlay.height = window.innerHeight;
+            } else {
+                const rect = _overlayContainer.getBoundingClientRect();
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = rect.left + 'px';
+                _overlay.style.top = rect.top + 'px';
+                _overlay.width = Math.max(0, Math.floor(rect.width));
+                _overlay.height = Math.max(0, Math.floor(rect.height));
+            }
+            renderOverlay();
+        }
+
+        function containerScrollLeft() {
+            return (_overlayContainer === window) ? (window.scrollX || 0) : (_overlayContainer.scrollLeft || 0);
+        }
+        function containerScrollTop() {
+            return (_overlayContainer === window) ? (window.scrollY || 0) : (_overlayContainer.scrollTop || 0);
+        }
+
+        function toCanvasCoords(clientX, clientY) {
+            const rect = _overlay.getBoundingClientRect();
+            return { x: clientX - rect.left, y: clientY - rect.top };
+        }
+
+        function onPointerDown(e) {
+            const tool = document.body.dataset.tool;
+            if (tool === 'arrow') {
+                startDrawArrow(e);
+            } else if (tool === 'pen') {
+                startDrawPen(e);
+            } else if (tool === 'eraser') {
+                eraseAt(e);
+            } else if (tool === 'spotlight') {
+                startDrawSpotlight(e);
+            }
+        }
+
+        function onPointerMove(e) {
+            // Update cursor position
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _cursorX = pt.x;
+            _cursorY = pt.y;
+            
+            if (!_drawing) {
+                // Just update cursor position and re-render
+                if (_cursorVisible) {
+                    renderOverlay();
+                }
+                return;
+            }
+            
+            if (_drawing.type === 'pen') {
+                moveDrawPen(e);
+            } else if (_drawing.type === 'spotlight') {
+                moveDrawSpotlight(e);
+            } else {
+                moveDrawArrow(e);
+            }
+        }
+        
+        function onPointerEnter(e) {
+            _cursorVisible = document.body.dataset.tool !== 'none';
+            if (_cursorVisible) {
+                renderOverlay();
+            }
+        }
+        
+        function onPointerLeave(e) {
+            _cursorVisible = false;
+            renderOverlay();
+        }
+
+        function onPointerUp(e) {
+            if (!_drawing) return;
+            if (_drawing.type === 'pen') {
+                endDrawPen();
+            } else if (_drawing.type === 'spotlight') {
+                endDrawSpotlight();
+            } else {
+                endDrawArrow();
+            }
+        }
+
+        function startDrawArrow(e) {
+            if (document.body.dataset.tool !== 'arrow') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                x1: pt.x + containerScrollLeft(),
+                y1: pt.y + containerScrollTop(),
+                x2: pt.x + containerScrollLeft(),
+                y2: pt.y + containerScrollTop(),
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawArrow(e) {
+            if (!_drawing) return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.x2 = pt.x + containerScrollLeft();
+            _drawing.y2 = pt.y + containerScrollTop();
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawArrow() {
+            if (!_drawing) return;
+            _shapes.push({ 
+                type: 'arrow', 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawPen(e) {
+            if (document.body.dataset.tool !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'pen',
+                points: [{
+                    x: pt.x + containerScrollLeft(),
+                    y: pt.y + containerScrollTop()
+                }],
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawPen(e) {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.points.push({
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop()
+            });
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawPen() {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            if (_drawing.points.length > 1) {
+                _shapes.push({ 
+                    ..._drawing,
+                    createdAt: Date.now(),
+                    fadeoutTime: getFadeoutTime(),
+                    opacity: 1.0
+                });
+            }
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawSpotlight(e) {
+            if (document.body.dataset.tool !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'spotlight',
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop(),
+                radius: getLineThickness() * 20, // Use thickness to control spotlight size (bigger default)
+                color: getArrowColor()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawSpotlight(e) {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const dx = pt.x + containerScrollLeft() - _drawing.x;
+            const dy = pt.y + containerScrollTop() - _drawing.y;
+            _drawing.radius = Math.max(20, Math.sqrt(dx * dx + dy * dy)); // Minimum radius of 20
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawSpotlight() {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            _shapes.push({ 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function distPointToSegment(px, py, x1, y1, x2, y2) {
+            const dx = x2 - x1, dy = y2 - y1;
+            if (dx === 0 && dy === 0) return Math.hypot(px - x1, py - y1);
+            const t = Math.max(0, Math.min(1, ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)));
+            const cx = x1 + t * dx, cy = y1 + t * dy;
+            return Math.hypot(px - cx, py - cy);
+        }
+
+        function eraseAt(e) {
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const x = pt.x + containerScrollLeft();
+            const y = pt.y + containerScrollTop();
+            const threshold = 10; // pixels
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const s = _shapes[i];
+                if (s.type === 'arrow') {
+                    const d = distPointToSegment(x, y, s.x1, s.y1, s.x2, s.y2);
+                    if (d <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                } else if (s.type === 'pen' && s.points) {
+                    // Check if click is near any line segment in the pen stroke
+                    let minDist = Infinity;
+                    for (let j = 1; j < s.points.length; j++) {
+                        const d = distPointToSegment(x, y, s.points[j-1].x, s.points[j-1].y, s.points[j].x, s.points[j].y);
+                        minDist = Math.min(minDist, d);
+                    }
+                    if (minDist <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                }
+            }
+            e.preventDefault();
+        }
+
+        function drawArrow(ctx, x1, y1, x2, y2, color, width, opacity = 1.0) {
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.fillStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            
+            // Check if points are too close (initial state)
+            const dx = x2 - x1;
+            const dy = y2 - y1;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            
+            if (distance < 5) {
+                // Draw just a small arrowhead pointing down-right when first clicked
+                const defaultAngle = Math.PI / 4; // 45 degrees (down-right)
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate arrowhead points
+                const hx1 = x1 + headLength * Math.cos(defaultAngle - headAngle);
+                const hy1 = y1 + headLength * Math.sin(defaultAngle - headAngle);
+                const hx2 = x1 + headLength * Math.cos(defaultAngle + headAngle);
+                const hy2 = y1 + headLength * Math.sin(defaultAngle + headAngle);
+                
+                // Draw arrowhead only
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            } else {
+                // Normal arrow drawing - head at x1,y1, tail at x2,y2
+                const angle = Math.atan2(y1 - y2, x1 - x2);
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate where the line should end (before the arrowhead)
+                const lineEndX = x1 - headLength * 0.8 * Math.cos(angle);
+                const lineEndY = y1 - headLength * 0.8 * Math.sin(angle);
+                
+                // Draw the line from tail to near the head
+                ctx.beginPath();
+                ctx.moveTo(x2, y2);
+                ctx.lineTo(lineEndX, lineEndY);
+                ctx.stroke();
+                
+                // Calculate arrowhead points
+                const hx1 = x1 - headLength * Math.cos(angle - headAngle);
+                const hy1 = y1 - headLength * Math.sin(angle - headAngle);
+                const hx2 = x1 - headLength * Math.cos(angle + headAngle);
+                const hy2 = y1 - headLength * Math.sin(angle + headAngle);
+                
+                // Draw arrowhead
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            }
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawPen(ctx, points, color, width, offX, offY, opacity = 1.0) {
+            if (!points || points.length < 2) return;
+            
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            ctx.beginPath();
+            ctx.moveTo(points[0].x - offX, points[0].y - offY);
+            for (let i = 1; i < points.length; i++) {
+                ctx.lineTo(points[i].x - offX, points[i].y - offY);
+            }
+            ctx.stroke();
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawAllSpotlights(ctx, spotlights, offX, offY) {
+            if (!spotlights || spotlights.length === 0) return;
+            
+            ctx.save();
+            
+            // Calculate the overall opacity based on all spotlights
+            const maxOpacity = Math.max(...spotlights.map(s => s.opacity || 1.0));
+            
+            // Fill entire canvas with dark overlay
+            ctx.fillStyle = `rgba(0, 0, 0, ${0.7 * maxOpacity})`;
+            ctx.fillRect(0, 0, ctx.canvas.width, ctx.canvas.height);
+            
+            // Cut out completely transparent holes for all spotlights
+            ctx.globalCompositeOperation = 'destination-out';
+            ctx.fillStyle = 'rgba(0, 0, 0, 1)'; // Solid black to ensure complete removal
+            for (const spotlight of spotlights) {
+                ctx.beginPath();
+                ctx.arc(spotlight.x - offX, spotlight.y - offY, spotlight.radius, 0, 2 * Math.PI);
+                ctx.fill();
+            }
+            
+            ctx.restore();
+        }
+
+        function renderOverlay() {
+            if (!_overlay || !_overlayCtx) return;
+            _overlayCtx.clearRect(0, 0, _overlay.width, _overlay.height);
+            const offX = containerScrollLeft();
+            const offY = containerScrollTop();
+            // Draw non-spotlight shapes first
+            for (const s of _shapes) {
+                const opacity = s.opacity !== undefined ? s.opacity : 1.0;
+                if (s.type === 'arrow') {
+                    drawArrow(_overlayCtx, s.x1 - offX, s.y1 - offY, s.x2 - offX, s.y2 - offY, s.color || '#f00', s.width || 2, opacity);
+                } else if (s.type === 'pen') {
+                    drawPen(_overlayCtx, s.points, s.color || '#f00', s.width || 2, offX, offY, opacity);
+                }
+            }
+            // Draw current drawing (non-spotlight)
+            if (_drawing) {
+                if (_drawing.type === 'pen') {
+                    drawPen(_overlayCtx, _drawing.points, _drawing.color, _drawing.width, offX, offY);
+                } else if (_drawing.type !== 'spotlight') {
+                    drawArrow(_overlayCtx, _drawing.x1 - offX, _drawing.y1 - offY, _drawing.x2 - offX, _drawing.y2 - offY, _drawing.color, _drawing.width);
+                }
+            }
+            
+            // Collect all spotlights (existing + current drawing + cursor preview)
+            const spotlights = [];
+            
+            // Add existing spotlight shapes
+            for (const s of _shapes) {
+                if (s.type === 'spotlight') {
+                    spotlights.push({
+                        x: s.x,
+                        y: s.y, 
+                        radius: s.radius,
+                        opacity: s.opacity !== undefined ? s.opacity : 1.0
+                    });
+                }
+            }
+            
+            // Add current spotlight being drawn
+            if (_drawing && _drawing.type === 'spotlight') {
+                spotlights.push({
+                    x: _drawing.x,
+                    y: _drawing.y,
+                    radius: _drawing.radius,
+                    opacity: 1.0
+                });
+            }
+            
+            // Add cursor preview spotlight if tool is active
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                if (tool === 'spotlight') {
+                    const thickness = getLineThickness();
+                    const radius = thickness * 20;
+                    const cursorWorldX = _cursorX + containerScrollLeft();
+                    const cursorWorldY = _cursorY + containerScrollTop();
+                    spotlights.push({
+                        x: cursorWorldX,
+                        y: cursorWorldY,
+                        radius: radius,
+                        opacity: 0.8
+                    });
+                }
+            }
+            
+            // Draw all spotlights as a single overlay with multiple holes
+            drawAllSpotlights(_overlayCtx, spotlights, offX, offY);
+            
+            // Draw cursor indicators for non-spotlight tools
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                const color = getArrowColor();
+                const thickness = getLineThickness();
+                
+                if (tool !== 'spotlight') {
+                    _overlayCtx.save();
+                    _overlayCtx.fillStyle = color;
+                    _overlayCtx.globalAlpha = 0.7;
+                    
+                    if (tool === 'eraser') {
+                        // Draw eraser indicator
+                        _overlayCtx.strokeStyle = color;
+                        _overlayCtx.lineWidth = 2;
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, 10, 0, 2 * Math.PI);
+                        _overlayCtx.stroke();
+                    } else {
+                        // Draw dot for pen/arrow
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, thickness / 2, 0, 2 * Math.PI);
+                        _overlayCtx.fill();
+                    }
+                    
+                    _overlayCtx.restore();
+                }
+            }
+        }
+
+        function setOverlayActive(active) {
+            if (!_overlay) initOverlay();
+            _overlay.style.pointerEvents = active ? 'auto' : 'none';
+            _overlay.style.cursor = active ? 'none' : 'auto';
+            // Re-render to ensure visibility aligns with content
+            renderOverlay();
+        }
+
+        function initOverlay() {
+            if (_overlay) return;
+            updateOverlayModeAndContainer();
+            _overlay = document.createElement('canvas');
+            _overlay.className = 'draw-overlay';
+            _overlayCtx = _overlay.getContext('2d');
+            document.body.appendChild(_overlay);
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+
+            // Events
+            _overlay.addEventListener('mousedown', onPointerDown);
+            _overlay.addEventListener('mousemove', onPointerMove);
+            _overlay.addEventListener('mouseenter', onPointerEnter);
+            _overlay.addEventListener('mouseleave', onPointerLeave);
+            document.addEventListener('mouseup', onPointerUp);
+            _overlay.addEventListener('touchstart', onPointerDown, { passive: false });
+            _overlay.addEventListener('touchmove', onPointerMove, { passive: false });
+            document.addEventListener('touchend', onPointerUp);
+
+            _overlayResizeHandler = () => updateOverlayBounds();
+            window.addEventListener('resize', _overlayResizeHandler);
+
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+            
+            // Start fade animation timer
+            _fadeTimer = setInterval(updateShapesFade, 100); // Update every 100ms for smooth fade
+        }
+
+        function rebindOverlayContainer() {
+            if (!_overlay) return;
+            // Remove old scroll handler
+            if (_overlayScrollHandler) { window.removeEventListener('scroll', _overlayScrollHandler); }
+            updateOverlayModeAndContainer();
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+        }
+
+        function teardownOverlay() {
+            if (!_overlay) return;
+            _overlay.removeEventListener('mousedown', onPointerDown);
+            _overlay.removeEventListener('mousemove', onPointerMove);
+            _overlay.removeEventListener('mouseenter', onPointerEnter);
+            _overlay.removeEventListener('mouseleave', onPointerLeave);
+            document.removeEventListener('mouseup', onPointerUp);
+            _overlay.removeEventListener('touchstart', onPointerDown);
+            _overlay.removeEventListener('touchmove', onPointerMove);
+            document.removeEventListener('touchend', onPointerUp);
+            if (_overlayResizeHandler) window.removeEventListener('resize', _overlayResizeHandler);
+            if (_overlayScrollHandler) {
+                if (_overlayContainer === window) {
+                    window.removeEventListener('scroll', _overlayScrollHandler);
+                } else if (_overlayContainer) {
+                    _overlayContainer.removeEventListener('scroll', _overlayScrollHandler);
+                }
+            }
+            if (_fadeTimer) {
+                clearInterval(_fadeTimer);
+                _fadeTimer = null;
+            }
+            if (_overlay.parentNode) _overlay.parentNode.removeChild(_overlay);
+            _overlay = null; _overlayCtx = null; _overlayContainer = null; _overlayResizeHandler = null; _overlayScrollHandler = null; _drawing = null;
+        }
+        
+        function teardownFileExplorer() {
+            const fe = document.querySelector('.file-explorer');
+            if (fe && fe.parentNode) fe.parentNode.removeChild(fe);
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+
+        function runCell(cellId){
+            const btn=document.querySelector('.run-btn[onclick*="'+cellId+'"]');
+            const output=document.getElementById('output-'+cellId);
+            if(btn){btn.textContent='⏳ running...';btn.disabled=true;}
+            if(output){output.classList.add('output-stale');}
+            fetch('/run/'+cellId,{method:'POST'}).then(r=>r.json()).then(data=>{
+                if(output){
+                    output.classList.remove('output-stale');
+                    let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
+
+                    console.log('UV Logs:', data);
+                    if(data.stderr) {
+                        // Split UV logs from regular stderr
+                        const lines = data.stderr.split('\\n');
+                        let uvLogs = [];
+                        let regularLogs = [];
+                        let inUvSection = true;
+
+                        for (const line of lines) {
+                            if (inUvSection) {
+                                uvLogs.push(line);
+                                if (line.startsWith('Installed ')) {
+                                    inUvSection = false;
+                                }
+                            } else {
+                                regularLogs.push(line);
+                            }
+                        }
+                        
+
+                        // If we never found "Installed", treat it all as regular stderr
+                        if (inUvSection) {
+                            html+='<div class="cell-stderr">'+escapeHtml(data.stderr)+'</div>';
+                        } else {
+                            const uvLogsStr = uvLogs.join('\\n');
+                            const regularLogsStr = regularLogs.join('\\n').trim();
+
+                            if (uvLogsStr) {
+                                html+='<div class="uv-install-logs">';
+                                html+='<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>';
+                                html+='<div class="uv-logs-content" style="display: none;">'+escapeHtml(uvLogsStr)+'</div>';
+                                html+='</div>';
+                            }
+                            if (regularLogsStr) {
+                                html+='<div class="cell-stderr">'+escapeHtml(regularLogsStr)+'</div>';
+                            }
+                        }
+                    }
+                    output.innerHTML=html;
+                }
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            }).catch(e=>{
+                console.error('Run failed:',e);
+                if(output){output.classList.remove('output-stale');}
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            });
+        }
+
+        function copyCell(cellId){
+            // Try multiple selectors to find the code element
+            // Pygments generates .highlight > pre with spans, not wrapped in <code>
+            let codeElement = document.querySelector('#code-'+cellId+' .highlight pre');
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' pre');
+            }
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' code');
+            }
+            if (!codeElement) {
+                // Fallback to the code div itself
+                codeElement = document.getElementById('code-'+cellId);
+            }
+
+            const btn = document.querySelector('.copy-btn[onclick*="'+cellId+'"]');
+
+            if (!codeElement) {
+                console.error('Code element not found for cell:', cellId);
+                return;
+            }
+            if (!btn) {
+                console.error('Copy button not found for cell:', cellId);
+                return;
+            }
+
+            const codeText = codeElement.textContent;
+            
+            if (navigator.clipboard && navigator.clipboard.writeText) {
+                navigator.clipboard.writeText(codeText).then(function() {
+                    console.log('Clipboard copy successful');
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                }).catch(function(err) {
+                    console.warn('Clipboard copy failed:', err);
+                    fallbackCopy();
+                });
+            } else {
+                console.log('Using fallback copy method');
+                fallbackCopy();
+            }
+            
+            function fallbackCopy() {
+                const textarea = document.createElement('textarea');
+                textarea.value = codeText;
+                textarea.style.position = 'absolute';
+                textarea.style.left = '-9999px';
+                document.body.appendChild(textarea);
+                textarea.select();
+                try {
+                    const success = document.execCommand('copy');
+                    console.log('Fallback copy success:', success);
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                } catch (err) {
+                    console.error('Fallback copy failed:', err);
+                    btn.textContent = 'Copy failed';
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                    }, 2000);
+                }
+                document.body.removeChild(textarea);
+            }
+        }
+
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
+
+
+        document.addEventListener('DOMContentLoaded', function() {
+            // Capture initial cell states before any modifications
+            captureInitialCellStates();
+            
+            updateThemeIcon();
+            updateUiMenu();
+            updateUiDebug();
+            const widgetsEnabled = (document.documentElement.getAttribute('data-widgets') || 'on') === 'on';
+            if (widgetsEnabled) {
+            initMinimap();
+            initFileExplorer();
+            initTools();
+            initOverlay();
+            initStatusWidget();
+            initializeWidgetVisibility();
+            layoutWidgetsStackedBottomRight();
+            window.addEventListener('resize', layoutWidgetsStackedBottomRight);
+            }
+
+            // Apply deep-link selection if present
+            applyLocationFromUrl();
+            updateStateIndicator();
+            
+            // Apply cell states from URL immediately
+            const url = new URL(window.location.href);
+            const encodedCellStates = url.searchParams.get('cells');
+            if (encodedCellStates) {
+                console.log('Applying cell states from URL...');
+                const cellStates = decodeCellStatesFromUrl(encodedCellStates);
+                
+                // Use requestAnimationFrame to ensure DOM is ready
+                requestAnimationFrame(() => {
+                    applyCellStatesFromUrl(cellStates);
+                    
+                    // Clear initialization flag after cell states are applied
+                    if (typeof _isInitializing !== 'undefined') {
+                        _isInitializing = false;
+                    }
+                });
+            } else {
+                // Clear initialization flag even if no cell states
+                if (typeof _isInitializing !== 'undefined') {
+                    requestAnimationFrame(() => {
+                        _isInitializing = false;
+                    });
+                }
+            }
+
+            // Bind drag selection on line numbers
+            document.addEventListener('mousedown', onLineNumberMouseDown);
+            document.addEventListener('mousemove', onDocMouseMove);
+            document.addEventListener('mouseup', onDocMouseUp);
+
+            // Add ESC key handler to exit tools
+            document.addEventListener('keydown', function(e) {
+                if (e.key === 'Escape' || e.keyCode === 27) {
+                    const currentTool = document.body.dataset.tool;
+                    if (currentTool && currentTool !== 'none') {
+                        // Deactivate the current tool
+                        window.setActiveTool('none');
+                    }
+                    // Also clear any active line selection
+                    clearSelection(true);
+                }
+            });
+        });
+
+        function updateStateIndicator() {
+            try {
+                const el = document.getElementById('status-widget');
+                if (!el) return;
+                const tool = document.body.dataset.tool || 'none';
+                if (tool && tool !== 'none') {
+                    el.textContent = `tool: ${tool} — Esc`;
+                    return;
+                }
+                if (_selection) {
+                    const t = _selection.a === _selection.b ? `L${_selection.a}` : `L${_selection.a}-${_selection.b}`;
+                    el.textContent = `selected: ${t} — Esc`;
+                    return;
+                }
+                el.textContent = 'ready — Esc';
+            } catch (_) {}
+        }
+    </script>
+</head>
+
+
+<body>
+    <div class="controls">
+        <div class="controls-buttons">
+            
+            <a href="index.html" class="back-button">← back</a>
+            
+            <div class="theme-toggle" onclick="toggleTheme()">light</div>
+            <div class="reset-toggle" onclick="resetLayout()">reset</div>
+            <div class="menu-button" onclick="toggleMenu()">
+                menu ▼
+                <div class="menu-dropdown">
+                    <div class="menu-item" onclick="setUiTheme('default')">
+                        <span class="menu-checkbox" id="checkbox-ui-default">☑</span> Theme: default
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('none')">
+                        <span class="menu-checkbox" id="checkbox-ui-none">☐</span> Theme: none
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('monocolor')">
+                        <span class="menu-checkbox" id="checkbox-ui-monocolor">☐</span> Theme: monocolor
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('tools')">
+                        <span class="menu-checkbox" id="checkbox-tools">☐</span> Tools
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('file-explorer')">
+                        <span class="menu-checkbox" id="checkbox-file-explorer">☐</span> File Explorer
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('minimap')">
+                        <span class="menu-checkbox" id="checkbox-minimap">☐</span> Table of Contents
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('status')">
+                        <span class="menu-checkbox" id="checkbox-status">☑</span> Status Indicator
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    
+    <div class="system-info">
+        <div class="system-info-header">Generated on:</div>
+        <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+        </div>
+    </div>
+    
+    <div class="main-content">
+        <h1>GptOssExperts - OpenAI-style MoE</h1>
+<h2>GPU Info</h2>
+<div class="cell" id="cell-nv">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span> 
+<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+</span> | 
+Cell: nv | 0.24s
+ | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
+<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/openai_moe/impls/gpt_oss_moe.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/drbh/yamoe" target="_blank" class="hf-btn">🤗 HF</a>
+</div>
+<div id="code-nv" class="cell-code" data-lines="2">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">&quot;nvidia-smi&quot;</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-nv"></div>
+</div>
+</div>
+<div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:34 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   34C    P0             81W /  350W |       0MiB /  46068MiB |     18%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+</pre></div>
+</div>
+</div>
+
+<h2>OpenAI-style MoE Benchmark (GptOssExperts Reference)</h2>
+<div class="cell" id="cell-benchmark">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
+<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+</span> | 
+Cell: benchmark | 24.32s
+ | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
+<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/openai_moe/impls/gpt_oss_moe.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/drbh/yamoe" target="_blank" class="hf-btn">🤗 HF</a>
+</div>
+<div id="code-benchmark" class="cell-code" data-lines="81">
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
+<span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
+<span class="c1"># dependencies = [</span>
+<span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
+<span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
+<span class="c1">#     &quot;kernels&quot;,</span>
+<span class="c1"># ]</span>
+<span class="c1">#</span>
+<span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
+<span class="c1"># kernels = { git = &quot;https://github.com/huggingface/kernels.git&quot; }</span>
+<span class="c1"># ///</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="kn">import</span> <span class="n">KernelTypeEnum</span><span class="p">,</span> <span class="n">run_benchmark</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
+
+<span class="c1"># Load yamoe to get GptOssExperts reference</span>
+<span class="n">yamoe</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">&quot;drbh/yamoe&quot;</span><span class="p">,</span> <span class="n">revision</span><span class="o">=</span><span class="s2">&quot;v0.2.0&quot;</span><span class="p">)</span>
+<span class="n">GptOssExperts</span> <span class="o">=</span> <span class="n">yamoe</span><span class="o">.</span><span class="n">vendored</span><span class="o">.</span><span class="n">gpt_oss_mlp</span><span class="o">.</span><span class="n">GptOssExperts</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">gpt_oss_openai_moe</span><span class="p">(</span>
+    <span class="n">hidden_states</span><span class="p">,</span>
+    <span class="n">router_indices</span><span class="p">,</span>
+    <span class="n">routing_weights</span><span class="p">,</span>
+    <span class="n">gate_up_proj</span><span class="p">,</span>
+    <span class="n">gate_up_proj_bias</span><span class="p">,</span>
+    <span class="n">down_proj</span><span class="p">,</span>
+    <span class="n">down_proj_bias</span><span class="p">,</span>
+<span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    GptOssExperts reference implementation of OpenAI-style MoE.</span>
+<span class="sd">    This is the reference model implementation from the original GPT OSS codebase.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="n">B</span><span class="p">,</span> <span class="n">S</span><span class="p">,</span> <span class="n">H</span> <span class="o">=</span> <span class="n">hidden_states</span><span class="o">.</span><span class="n">shape</span>
+    <span class="n">E</span> <span class="o">=</span> <span class="n">routing_weights</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
+
+    <span class="c1"># Create a config object for GptOssExperts</span>
+    <span class="n">config</span> <span class="o">=</span> <span class="nb">type</span><span class="p">(</span><span class="s2">&quot;Config&quot;</span><span class="p">,</span> <span class="p">(),</span> <span class="p">{})()</span>
+    <span class="n">config</span><span class="o">.</span><span class="n">hidden_size</span> <span class="o">=</span> <span class="n">H</span>
+    <span class="n">config</span><span class="o">.</span><span class="n">intermediate_size</span> <span class="o">=</span> <span class="n">gate_up_proj</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span> <span class="o">//</span> <span class="mi">2</span>  <span class="c1"># half of gate_up_proj&#39;s last dim (equals H here)</span>
+    <span class="n">config</span><span class="o">.</span><span class="n">num_local_experts</span> <span class="o">=</span> <span class="n">E</span>
+
+    <span class="c1"># Initialize model</span>
+    <span class="n">model</span> <span class="o">=</span> <span class="n">GptOssExperts</span><span class="p">(</span><span class="n">config</span><span class="p">)</span>
+
+    <span class="c1"># Set weights from benchmark inputs</span>
+    <span class="n">model</span><span class="o">.</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">data</span> <span class="o">=</span> <span class="n">gate_up_proj</span>
+    <span class="n">model</span><span class="o">.</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">data</span> <span class="o">=</span> <span class="n">gate_up_proj_bias</span>
+    <span class="n">model</span><span class="o">.</span><span class="n">down_proj</span><span class="o">.</span><span class="n">data</span> <span class="o">=</span> <span class="n">down_proj</span>
+    <span class="n">model</span><span class="o">.</span><span class="n">down_proj_bias</span><span class="o">.</span><span class="n">data</span> <span class="o">=</span> <span class="n">down_proj_bias</span>
+
+    <span class="n">model</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">hidden_states</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+    <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
+
+    <span class="c1"># Force GptOssExperts onto its per-expert loop path for correctness (matches naive_moe_ref behavior)</span>
+    <span class="c1"># The batched path processes all experts which can lead to numerical differences</span>
+    <span class="c1"># The loop path (taken on CPU or in training mode) explicitly uses router_indices like the reference implementation</span>
+    <span class="n">model</span><span class="o">.</span><span class="n">train</span><span class="p">()</span>  <span class="c1"># Training mode selects the loop path; tensors stay on hidden_states.device</span>
+
+    <span class="c1"># Flatten routing_weights to [batch_seq, num_experts]</span>
+    <span class="n">routing_weights_flat</span> <span class="o">=</span> <span class="n">routing_weights</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">E</span><span class="p">)</span>
+
+    <span class="c1"># Run forward pass</span>
+    <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">():</span>
+        <span class="n">output</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="n">hidden_states</span><span class="p">,</span> <span class="n">router_indices</span><span class="p">,</span> <span class="n">routing_weights_flat</span><span class="p">)</span>
+
+    <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>  <span class="c1"># Reset to eval mode</span>
+
+    <span class="k">return</span> <span class="n">output</span>
+
+
+<span class="n">run_benchmark</span><span class="p">(</span>
+    <span class="n">kernel_type</span><span class="o">=</span><span class="n">KernelTypeEnum</span><span class="o">.</span><span class="n">OPENAI_MOE</span><span class="p">,</span>
+    <span class="n">impl_name</span><span class="o">=</span><span class="s2">&quot;gpt_oss_experts&quot;</span><span class="p">,</span>
+    <span class="n">impl_tags</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;family&quot;</span><span class="p">:</span> <span class="s2">&quot;reference&quot;</span><span class="p">,</span> <span class="s2">&quot;backend&quot;</span><span class="p">:</span> <span class="s2">&quot;pytorch&quot;</span><span class="p">},</span>
+    <span class="n">impl_func</span><span class="o">=</span><span class="n">gpt_oss_openai_moe</span><span class="p">,</span>
+    <span class="n">dtype</span><span class="o">=</span><span class="s2">&quot;float32&quot;</span><span class="p">,</span>
+<span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-benchmark"></div>
+</div>
+</div>
+<div id="output-benchmark" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Running openai_moe benchmark on cuda with 8 workloads.
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      10.211ms       197.81%      10.211ms      10.211ms             1  
+                                        gpt_oss_experts        16.48%       2.023ms        99.94%      12.270ms      12.270ms       0.000us         0.00%       5.165ms       5.165ms             1  
+                                           aten::matmul         0.22%      26.489us         3.82%     468.520us      39.043us       0.000us         0.00%       4.540ms     378.357us            12  
+                                               aten::mm         2.36%     289.825us         3.60%     442.031us      36.836us       4.540ms        87.96%       4.540ms     378.357us            12  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.078ms        59.62%       3.078ms     341.948us             9  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.457ms        28.23%       1.457ms     485.813us             3  
+                                              aten::mul         1.42%     174.948us         2.34%     287.701us      11.988us     109.119us         2.11%     109.119us       4.547us            24  
+                                              aten::add         1.61%     197.786us         3.85%     472.357us      26.242us     103.039us         2.00%     103.039us       5.724us            18  
+                                            aten::index         1.73%     212.127us         2.86%     350.900us      29.242us      86.591us         1.68%      86.591us       7.216us            12  
+                                       aten::index_add_         0.51%      62.499us         0.79%      97.312us      16.219us      82.688us         1.60%      82.688us      13.781us             6  
+void at::native::indexFuncLargeIndex&lt;float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      82.688us         1.60%      82.688us      13.781us             6  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      80.511us         1.56%      80.511us       6.709us            12  
+                                          aten::nonzero         2.20%     270.146us         6.58%     808.380us      89.820us      63.743us         1.23%      74.368us       8.263us             9  
+                                            aten::clamp         0.98%     120.045us         1.63%     200.026us      16.669us      64.705us         1.25%      64.705us       5.392us            12  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      64.705us         1.25%      64.705us       5.392us            12  
+                                            aten::where         0.06%       7.400us         5.25%     644.007us     107.334us       0.000us         0.00%      60.384us      10.064us             6  
+                                    aten::nonzero_numpy         0.11%      13.320us         5.19%     636.607us     106.101us       0.000us         0.00%      60.384us      10.064us             6  
+void at::native::vectorized_gather_kernel&lt;16, long&gt;(...         0.00%       0.000us         0.00%       0.000us       0.000us      60.063us         1.16%      60.063us      10.011us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      56.800us         1.10%      56.800us       4.733us            12  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      50.911us         0.99%      50.911us       1.131us            45  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 12.278ms
+Self CUDA time total: 5.162ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      13.933ms       229.38%      13.933ms      13.933ms             1  
+                                        gpt_oss_experts        16.29%       2.560ms        99.97%      15.712ms      15.712ms       0.000us         0.00%       6.077ms       6.077ms             1  
+                                           aten::matmul         0.30%      47.223us         5.17%     812.581us      33.858us       0.000us         0.00%       5.268ms     219.512us            24  
+                                               aten::mm         3.09%     485.951us         4.87%     765.358us      31.890us       5.268ms        86.73%       5.268ms     219.512us            24  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.213ms        85.81%       5.213ms     217.198us            24  
+                                          aten::nonzero         2.45%     385.408us         7.89%       1.240ms      82.649us     112.163us         1.85%     134.498us       8.967us            15  
+                                              aten::mul         2.03%     318.275us         3.36%     528.222us      11.005us     130.496us         2.15%     130.496us       2.719us            48  
+                                              aten::add         2.25%     353.820us         3.74%     587.771us      16.327us     127.072us         2.09%     127.072us       3.530us            36  
+                                            aten::where         0.08%      11.882us         7.49%       1.177ms      98.080us       0.000us         0.00%     120.705us      10.059us            12  
+                                    aten::nonzero_numpy         0.15%      24.083us         7.41%       1.165ms      97.090us       0.000us         0.00%     120.705us      10.059us            12  
+                                            aten::index         2.31%     363.442us         3.93%     617.030us      25.710us     110.145us         1.81%     110.145us       4.589us            24  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     101.312us         1.67%     101.312us       4.221us            24  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us      91.447us         1.51%      91.447us       1.051us            87  
+                                            aten::clamp         1.32%     207.076us         2.26%     355.011us      14.792us      85.793us         1.41%      85.793us       3.575us            24  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      85.793us         1.41%      85.793us       3.575us            24  
+                                             aten::item         0.52%      81.620us        38.60%       6.066ms      84.255us       0.000us         0.00%      75.446us       1.048us            72  
+                              aten::_local_scalar_dense         2.00%     315.046us        38.08%       5.985ms      83.122us      75.446us         1.24%      75.446us       1.048us            72  
+                                       aten::index_add_         0.75%     118.511us         1.16%     182.084us      15.174us      72.926us         1.20%      72.926us       6.077us            12  
+void at::native::indexFuncLargeIndex&lt;float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us      72.926us         1.20%      72.926us       6.077us            12  
+void at::native::vectorized_gather_kernel&lt;16, long&gt;(...         0.00%       0.000us         0.00%       0.000us       0.000us      65.857us         1.08%      65.857us       5.488us            12  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 15.717ms
+Self CUDA time total: 6.074ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      12.540ms       148.48%      12.540ms      12.540ms             1  
+                                        gpt_oss_experts        11.83%       1.734ms        99.96%      14.654ms      14.654ms       0.000us         0.00%       8.451ms       8.451ms             1  
+                                           aten::matmul         0.16%      23.602us         3.00%     439.592us      36.633us       0.000us         0.00%       7.417ms     618.087us            12  
+                                               aten::mm         1.78%     261.037us         2.84%     415.990us      34.666us       7.417ms        87.82%       7.417ms     618.087us            12  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       4.532ms        53.65%       4.532ms     755.263us             6  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       1.475ms        17.46%       1.475ms     491.509us             3  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       1.405ms        16.64%       1.405ms     468.490us             3  
+                                              aten::mul         1.05%     153.262us         1.78%     261.173us      10.882us     197.791us         2.34%     197.791us       8.241us            24  
+                                              aten::add         1.26%     184.574us         2.07%     304.007us      16.889us     188.543us         2.23%     188.543us      10.475us            18  
+                                       aten::index_add_         0.35%      50.951us         0.57%      83.553us      13.925us     169.408us         2.01%     169.408us      28.235us             6  
+void at::native::indexFuncLargeIndex&lt;float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     169.408us         2.01%     169.408us      28.235us             6  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     149.663us         1.77%     149.663us      12.472us            12  
+                                            aten::index         1.27%     186.102us         2.16%     316.927us      26.411us     146.942us         1.74%     146.942us      12.245us            12  
+void at::native::vectorized_gather_kernel&lt;16, long&gt;(...         0.00%       0.000us         0.00%       0.000us       0.000us     117.440us         1.39%     117.440us      19.573us             6  
+                                            aten::clamp         0.71%     104.743us         1.22%     178.924us      14.910us     110.912us         1.31%     110.912us       9.243us            12  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     110.912us         1.31%     110.912us       9.243us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     104.864us         1.24%     104.864us       8.739us            12  
+                                          aten::nonzero         1.58%     232.211us         4.94%     724.348us      80.483us      69.633us         0.82%      81.377us       9.042us             9  
+                                            aten::where         0.04%       6.259us         4.08%     597.684us      99.614us       0.000us         0.00%      66.816us      11.136us             6  
+                                    aten::nonzero_numpy         0.08%      11.999us         4.03%     591.425us      98.571us       0.000us         0.00%      66.816us      11.136us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 14.659ms
+Self CUDA time total: 8.446ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      18.317ms       174.31%      18.317ms      18.317ms             1  
+                                        gpt_oss_experts        13.54%       2.761ms        99.97%      20.385ms      20.385ms       0.000us         0.00%      10.514ms      10.514ms             1  
+                                           aten::matmul         0.23%      47.082us         4.02%     819.853us      34.161us       0.000us         0.00%       9.237ms     384.865us            24  
+                                               aten::mm         2.37%     482.255us         3.79%     772.771us      32.199us       9.237ms        87.90%       9.237ms     384.865us            24  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       6.282ms        59.78%       6.282ms     349.001us            18  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.944ms        28.01%       2.944ms     490.655us             6  
+                                              aten::mul         1.50%     305.331us         2.55%     520.818us      10.850us     235.298us         2.24%     235.298us       4.902us            48  
+                                              aten::add         1.72%     351.113us         2.86%     584.036us      16.223us     213.502us         2.03%     213.502us       5.931us            36  
+                                            aten::index         1.95%     397.314us         3.28%     668.454us      27.852us     205.349us         1.95%     205.349us       8.556us            24  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     166.720us         1.59%     166.720us       6.947us            24  
+                                       aten::index_add_         0.50%     101.340us         0.81%     165.573us      13.798us     155.585us         1.48%     155.585us      12.965us            12  
+void at::native::indexFuncLargeIndex&lt;float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     155.585us         1.48%     155.585us      12.965us            12  
+void at::native::vectorized_gather_kernel&lt;16, long&gt;(...         0.00%       0.000us         0.00%       0.000us       0.000us     146.947us         1.40%     146.947us      12.246us            12  
+                                          aten::nonzero         1.95%     398.176us         6.26%       1.276ms      85.090us     121.380us         1.16%     145.668us       9.711us            15  
+                                            aten::clamp         1.04%     212.193us         1.79%     365.180us      15.216us     134.239us         1.28%     134.239us       5.593us            24  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.239us         1.28%     134.239us       5.593us            24  
+                                            aten::where         0.06%      11.340us         5.97%       1.216ms     101.373us       0.000us         0.00%     131.522us      10.960us            12  
+                                    aten::nonzero_numpy         0.12%      24.140us         5.91%       1.205ms     100.428us       0.000us         0.00%     131.522us      10.960us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     119.840us         1.14%     119.840us       4.993us            24  
+                         Memcpy DtoH (Device -&gt; Pinned)         0.00%       0.000us         0.00%       0.000us       0.000us     100.830us         0.96%     100.830us       1.159us            87  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 20.390ms
+Self CUDA time total: 10.509ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      21.031ms       119.92%      21.031ms      21.031ms             1  
+                                        gpt_oss_experts         7.59%       1.747ms        99.98%      23.024ms      23.024ms       0.000us         0.00%      17.548ms      17.548ms             1  
+                                           aten::matmul         0.10%      23.660us         1.94%     446.020us      37.168us       0.000us         0.00%      14.659ms       1.222ms            12  
+                                               aten::mm         1.17%     268.524us         1.83%     422.360us      35.197us      14.659ms        83.59%      14.659ms       1.222ms            12  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       8.967ms        51.13%       8.967ms       1.495ms             6  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       5.685ms        32.42%       5.685ms     947.562us             6  
+                                              aten::add         0.82%     187.722us         1.36%     312.616us      17.368us     785.408us         4.48%     785.408us      43.634us            18  
+                                              aten::mul         0.68%     156.369us         1.15%     264.222us      11.009us     674.688us         3.85%     674.688us      28.112us            24  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     504.575us         2.88%     504.575us      42.048us            12  
+                                       aten::index_add_         0.22%      50.951us         0.37%      86.132us      14.355us     448.545us         2.56%     448.545us      74.757us             6  
+void at::native::indexFuncLargeIndex&lt;float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     448.545us         2.56%     448.545us      74.757us             6  
+                                            aten::clamp         0.46%     107.053us         0.80%     183.295us      15.275us     336.000us         1.92%     336.000us      28.000us            12  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     336.000us         1.92%     336.000us      28.000us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     314.239us         1.79%     314.239us      52.373us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     280.833us         1.60%     280.833us      46.806us             6  
+                                            aten::index         0.81%     185.806us         1.39%     320.548us      26.712us     259.102us         1.48%     259.102us      21.592us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     258.944us         1.48%     258.944us      21.579us            12  
+void at::native::vectorized_gather_kernel&lt;16, long&gt;(...         0.00%       0.000us         0.00%       0.000us       0.000us     225.407us         1.29%     225.407us      37.568us             6  
+                                          aten::sigmoid         0.16%      36.131us         0.27%      61.901us      10.317us     175.073us         1.00%     175.073us      29.179us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     175.073us         1.00%     175.073us      29.179us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 23.030ms
+Self CUDA time total: 17.537ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      24.377ms       140.11%      24.377ms      24.377ms             1  
+                                        gpt_oss_experts        10.50%       2.651ms        99.98%      25.237ms      25.237ms       0.000us         0.00%      17.408ms      17.408ms             1  
+                                           aten::matmul         0.19%      47.519us         3.41%     860.801us      35.867us       0.000us         0.00%      15.185ms     632.705us            24  
+                                               aten::mm         2.06%     521.061us         3.22%     813.282us      33.887us      15.185ms        87.28%      15.185ms     632.705us            24  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us       9.179ms        52.76%       9.179ms     764.922us            12  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       3.124ms        17.96%       3.124ms     520.682us             6  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8...         0.00%       0.000us         0.00%       0.000us       0.000us       2.871ms        16.50%       2.871ms     478.432us             6  
+                                              aten::add         1.42%     359.495us         2.37%     598.003us      16.611us     427.713us         2.46%     427.713us      11.881us            36  
+                                              aten::mul         1.23%     309.946us         2.09%     527.073us      10.981us     420.510us         2.42%     420.510us       8.761us            48  
+                                       aten::index_add_         0.40%     101.283us         0.66%     166.886us      13.907us     383.489us         2.20%     383.489us      31.957us            12  
+void at::native::indexFuncLargeIndex&lt;float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     383.489us         2.20%     383.489us      31.957us            12  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     343.712us         1.98%     343.712us      14.321us            24  
+                                            aten::index         1.56%     393.991us         2.62%     662.158us      27.590us     337.086us         1.94%     337.086us      14.045us            24  
+void at::native::vectorized_gather_kernel&lt;16, long&gt;(...         0.00%       0.000us         0.00%       0.000us       0.000us     272.926us         1.57%     272.926us      22.744us            12  
+                                            aten::clamp         0.84%     212.993us         1.44%     363.038us      15.127us     230.431us         1.32%     230.431us       9.601us            24  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     230.431us         1.32%     230.431us       9.601us            24  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     223.071us         1.28%     223.071us       9.295us            24  
+                                          aten::nonzero         1.57%     395.401us         5.00%       1.262ms      84.127us     128.836us         0.74%     156.164us      10.411us            15  
+                                            aten::where         0.05%      12.011us         4.77%       1.205ms     100.378us       0.000us         0.00%     140.900us      11.742us            12  
+                                    aten::nonzero_numpy         0.10%      25.021us         4.72%       1.193ms      99.377us       0.000us         0.00%     140.900us      11.742us            12  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 25.242ms
+Self CUDA time total: 17.398ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      40.556ms       109.47%      40.556ms      40.556ms             1  
+                                        gpt_oss_experts         4.33%       1.794ms        99.85%      41.353ms      41.353ms       0.000us         0.00%      37.080ms      37.080ms             1  
+                                           aten::matmul         0.06%      24.371us         1.08%     445.903us      37.159us       0.000us         0.00%      27.082ms       2.257ms            12  
+                                               aten::mm         0.70%     291.738us         1.02%     421.532us      35.128us      27.082ms        73.10%      27.082ms       2.257ms            12  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      27.079ms        73.09%      27.079ms       2.257ms            12  
+                                              aten::mul         0.38%     159.199us         0.65%     268.178us      11.174us       2.983ms         8.05%       2.983ms     124.287us            24  
+                                              aten::add         0.48%     198.424us         1.09%     451.763us      25.098us       2.404ms         6.49%       2.404ms     133.559us            18  
+                                            aten::clamp         0.27%     112.290us         0.46%     189.433us      15.786us       2.392ms         6.46%       2.392ms     199.373us            12  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       2.392ms         6.46%       2.392ms     199.373us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.988ms         5.37%       1.988ms     165.669us            12  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       1.629ms         4.40%       1.629ms     135.763us            12  
+                                       aten::index_add_         0.12%      50.103us         0.20%      84.453us      14.076us     899.456us         2.43%     899.456us     149.909us             6  
+void at::native::indexFuncLargeIndex&lt;float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     899.456us         2.43%     899.456us     149.909us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     774.912us         2.09%     774.912us     129.152us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     733.217us         1.98%     733.217us     122.203us             6  
+                                            aten::index         0.45%     187.302us         0.77%     318.787us      26.566us     712.767us         1.92%     712.767us      59.397us            12  
+void at::native::vectorized_gather_kernel&lt;16, long&gt;(...         0.00%       0.000us         0.00%       0.000us       0.000us     678.496us         1.83%     678.496us     113.083us             6  
+                                          aten::sigmoid         0.09%      36.082us         0.15%      63.023us      10.504us     323.008us         0.87%     323.008us      53.835us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     323.008us         0.87%     323.008us      53.835us             6  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     261.631us         0.71%     261.631us      43.605us             6  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 41.415ms
+Self CUDA time total: 37.046ms
+
+
+
+======================================================================
+PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                        gpt_oss_experts         0.00%       0.000us         0.00%       0.000us       0.000us      41.050ms       117.27%      41.050ms      41.050ms             1  
+                                        gpt_oss_experts         6.46%       2.709ms        99.99%      41.912ms      41.912ms       0.000us         0.00%      35.025ms      35.025ms             1  
+                                           aten::matmul         0.11%      47.590us         2.12%     888.873us      37.036us       0.000us         0.00%      29.051ms       1.210ms            24  
+                                               aten::mm         1.28%     536.727us         2.01%     841.283us      35.053us      29.051ms        82.99%      29.051ms       1.210ms            24  
+void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_...         0.00%       0.000us         0.00%       0.000us       0.000us      20.585ms        58.81%      20.585ms       1.372ms            15  
+                                 ampere_sgemm_128x64_nn         0.00%       0.000us         0.00%       0.000us       0.000us       8.453ms        24.15%       8.453ms     939.204us             9  
+                                              aten::add         0.88%     367.610us         1.45%     609.056us      16.918us       1.486ms         4.24%       1.486ms      41.264us            36  
+                                              aten::mul         0.74%     309.128us         1.24%     518.283us      10.798us       1.380ms         3.94%       1.380ms      28.757us            48  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     925.695us         2.64%     925.695us      38.571us            24  
+                                       aten::index_add_         0.24%      99.111us         0.40%     167.273us      13.939us     903.487us         2.58%     903.487us      75.291us            12  
+void at::native::indexFuncLargeIndex&lt;float, long, un...         0.00%       0.000us         0.00%       0.000us       0.000us     903.487us         2.58%     903.487us      75.291us            12  
+                                            aten::clamp         0.51%     214.986us         0.87%     364.790us      15.200us     775.806us         2.22%     775.806us      32.325us            24  
+void at::native::elementwise_kernel&lt;128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     775.806us         2.22%     775.806us      32.325us            24  
+                                            aten::index         0.89%     373.269us         1.50%     629.207us      26.217us     670.881us         1.92%     670.881us      27.953us            24  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     631.200us         1.80%     631.200us      52.600us            12  
+void at::native::vectorized_gather_kernel&lt;16, long&gt;(...         0.00%       0.000us         0.00%       0.000us       0.000us     600.224us         1.71%     600.224us      50.019us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     559.808us         1.60%     559.808us      46.651us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     540.611us         1.54%     540.611us      22.525us            24  
+                                          aten::sigmoid         0.17%      72.182us         0.29%     123.582us      10.298us     351.039us         1.00%     351.039us      29.253us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     351.039us         1.00%     351.039us      29.253us            12  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 41.917ms
+Self CUDA time total: 35.005ms
+
+
+impl                     wl                  p50(ms)  ok
+gpt_oss_experts          cuda_B1_S1024_E2       3.79  True
+gpt_oss_experts          cuda_B1_S1024_E4       5.24  True
+gpt_oss_experts          cuda_B1_S512_E2        2.63  True
+gpt_oss_experts          cuda_B1_S512_E4        3.89  True
+gpt_oss_experts          cuda_B4_S1024_E2      13.28  True
+gpt_oss_experts          cuda_B4_S1024_E4      13.19  True
+gpt_oss_experts          cuda_B4_S512_E2        6.74  True
+gpt_oss_experts          cuda_B4_S512_E4        7.36  True
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+   Updating https://github.com/huggingface/kernels.git (HEAD)
+    Updated https://github.com/huggingface/kernels.git (10753bdcb99934587f52ff7fcb13cf5d23ab7b98)
+   Building kernels @ git+https://github.com/huggingface/kernels.git@10753bdcb99934587f52ff7fcb13cf5d23ab7b98
+      Built kernels @ git+https://github.com/huggingface/kernels.git@10753bdcb99934587f52ff7fcb13cf5d23ab7b98
+Installed 52 packages in 205ms
+</div>
+</div>
+<div class="cell-stderr">Fetching 6 files:   0%|          | 0/6 [00:00&lt;?, ?it/s]
+Fetching 6 files:  33%|███▎      | 2/6 [00:00&lt;00:00, 16.13it/s]
+Fetching 6 files:  67%|██████▋   | 4/6 [00:00&lt;00:00,  7.33it/s]
+Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 11.97it/s]</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
+</div>
+</div>
+</div>
+    </div>
+    
+</body>
+</html>
\ No newline at end of file
diff --git a/openai_moe/impls/index.html b/openai_moe/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..a032fd6256daec3a2b89ade46bc2b05f2a12fbf3
--- /dev/null
+++ b/openai_moe/impls/index.html
@@ -0,0 +1,89 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset='UTF-8'>
+  <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+  <title>Index of /openai_moe/impls</title>
+  <style>
+    :root {
+      --bg-primary: #0a0a0a;
+      --bg-secondary: #121212;
+      --bg-tertiary: #181818;
+      --text-primary: #e0e0e0;
+      --text-secondary: #888888;
+      --text-link: #64b5f6;
+      --border-primary: #2a2a2a;
+    }
+    body {
+      font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+      background: var(--bg-primary);
+      color: var(--text-primary);
+      margin: 0;
+      padding: 16px;
+      max-width: 900px;
+      margin: 0 auto;
+    }
+    .controls {
+      display: flex;
+      justify-content: flex-end;
+      margin-bottom: 1rem;
+    }
+    .back-button {
+      background: var(--bg-secondary);
+      border: 1px solid var(--border-primary);
+      padding: 8px 12px;
+      border-radius: 4px;
+      color: var(--text-secondary);
+      cursor: pointer;
+      font-size: 0.9rem;
+      text-decoration: none;
+      display: inline-block;
+    }
+    .back-button:hover {
+      color: var(--text-primary);
+      background: var(--bg-tertiary);
+    }
+    h1 {
+      font-size: 1.5em;
+      margin: 1rem 0;
+      color: var(--text-primary);
+      border-bottom: 1px solid var(--border-primary);
+      padding-bottom: 0.5rem;
+    }
+    ul {
+      list-style-type: none;
+      padding: 0;
+    }
+    li {
+      margin: 0;
+      border-bottom: 1px solid var(--border-primary);
+    }
+    li:last-child {
+      border-bottom: none;
+    }
+    a {
+      display: block;
+      padding: 0.75rem 0.5rem;
+      text-decoration: none;
+      color: var(--text-link);
+      transition: background 0.2s ease;
+    }
+    a:hover {
+      background: var(--bg-secondary);
+    }
+    .dir {
+      font-weight: 500;
+    }
+  </style>
+</head>
+<body>
+  <div class='controls'>
+    <a href='../index.html' class='back-button'>← back</a>
+  </div>
+  <h1>Index of /openai_moe/impls</h1>
+  <ul>
+    <li><a href='binned_torch.html' class='file'>binned_torch.html</a></li>
+    <li><a href='gpt_oss_moe.html' class='file'>gpt_oss_moe.html</a></li>
+  </ul>
+</body>
+</html>
\ No newline at end of file
diff --git a/openai_moe/index.html b/openai_moe/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..aa8352a8b25e9434df4f5e5d95c60283730bb0ee
--- /dev/null
+++ b/openai_moe/index.html
@@ -0,0 +1,89 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset='UTF-8'>
+  <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+  <title>Index of /openai_moe</title>
+  <style>
+    :root {
+      --bg-primary: #0a0a0a;
+      --bg-secondary: #121212;
+      --bg-tertiary: #181818;
+      --text-primary: #e0e0e0;
+      --text-secondary: #888888;
+      --text-link: #64b5f6;
+      --border-primary: #2a2a2a;
+    }
+    body {
+      font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+      background: var(--bg-primary);
+      color: var(--text-primary);
+      margin: 0;
+      padding: 16px;
+      max-width: 900px;
+      margin: 0 auto;
+    }
+    .controls {
+      display: flex;
+      justify-content: flex-end;
+      margin-bottom: 1rem;
+    }
+    .back-button {
+      background: var(--bg-secondary);
+      border: 1px solid var(--border-primary);
+      padding: 8px 12px;
+      border-radius: 4px;
+      color: var(--text-secondary);
+      cursor: pointer;
+      font-size: 0.9rem;
+      text-decoration: none;
+      display: inline-block;
+    }
+    .back-button:hover {
+      color: var(--text-primary);
+      background: var(--bg-tertiary);
+    }
+    h1 {
+      font-size: 1.5em;
+      margin: 1rem 0;
+      color: var(--text-primary);
+      border-bottom: 1px solid var(--border-primary);
+      padding-bottom: 0.5rem;
+    }
+    ul {
+      list-style-type: none;
+      padding: 0;
+    }
+    li {
+      margin: 0;
+      border-bottom: 1px solid var(--border-primary);
+    }
+    li:last-child {
+      border-bottom: none;
+    }
+    a {
+      display: block;
+      padding: 0.75rem 0.5rem;
+      text-decoration: none;
+      color: var(--text-link);
+      transition: background 0.2s ease;
+    }
+    a:hover {
+      background: var(--bg-secondary);
+    }
+    .dir {
+      font-weight: 500;
+    }
+  </style>
+</head>
+<body>
+  <div class='controls'>
+    <a href='../index.html' class='back-button'>← back</a>
+  </div>
+  <h1>Index of /openai_moe</h1>
+  <ul>
+    <li><a href='impls/index.html' class='dir'>impls/</a></li>
+    <li><a href='results/index.html' class='dir'>results/</a></li>
+  </ul>
+</body>
+</html>
\ No newline at end of file
diff --git a/openai_moe/results/artifacts/combine/latency.svg b/openai_moe/results/artifacts/combine/latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..10dbc66aeb1ffe85716a2da3bc2a8a2ad4700bc3
--- /dev/null
+++ b/openai_moe/results/artifacts/combine/latency.svg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6b68c91c95cfb46a71083a3812949c831a6e82a5f655eb32ed7c0b19426124d
+size 21857
diff --git a/openai_moe/results/cells/combine.py b/openai_moe/results/cells/combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca2ef03a01cd0a7f3bc24a5646d108265d436bf2
--- /dev/null
+++ b/openai_moe/results/cells/combine.py
@@ -0,0 +1,27 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+    # "PyTorch OpenAI MoE": "UVNOTE_FILE_TORCH_OPENAI_MOE_BENCHMARK",
+    "Binned PyTorch": "UVNOTE_FILE_BINNED_TORCH_BENCHMARK",
+    "GptOssExperts": "UVNOTE_FILE_GPT_OSS_MOE_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+    cache_env_map=cache_env_map,
+    output_filename="openai_moe.jsonl",
+    svg_filename="latency.svg"
+)
\ No newline at end of file
diff --git a/openai_moe/results/combined_results.html b/openai_moe/results/combined_results.html
new file mode 100644
index 0000000000000000000000000000000000000000..bedcc83cf8db7d27f0e74cfbb9d1c9ceb5663901
--- /dev/null
+++ b/openai_moe/results/combined_results.html
@@ -0,0 +1,4935 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>OpenAI-style MoE Benchmark - Combined Results</title>
+
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&display=swap" rel="stylesheet">
+
+    <script>
+// Iframe-friendly navigation router
+        (function() {
+            const isIframe = window.self !== window.top;
+            if (!isIframe) return; // Only activate in iframe context
+
+            // On load: if hash points to a different page, navigate there
+            const hash = window.location.hash;
+            if (hash && hash.startsWith('#/')) {
+                const targetPath = hash.slice(2); // Remove '#/'
+                const currentPath = window.location.pathname.split('/').pop();
+
+                // Only navigate if we're not already on the target page
+                if (targetPath !== currentPath) {
+                    window.location.href = targetPath;
+                    return; // Stop execution, we're navigating away
+                }
+            }
+
+            // Intercept all link clicks for hash-based navigation
+            document.addEventListener('click', function(e) {
+                const link = e.target.closest('a');
+                if (!link) return;
+
+                const href = link.getAttribute('href');
+
+                // Skip external links, anchors, and javascript: links
+                if (!href || href.startsWith('#') || href.startsWith('http') || href.startsWith('javascript:')) {
+                    return;
+                }
+
+                e.preventDefault();
+
+                // Convert relative/absolute path to hash-based navigation
+                const url = new URL(href, window.location.href);
+                let fullPath = url.pathname;
+
+                // Remove leading slash if present for cleaner paths
+                if (fullPath.startsWith('/')) {
+                    fullPath = fullPath.slice(1);
+                }
+
+                // Update parent URL hash
+                window.location.hash = '#/' + fullPath;
+
+                // For HTML files, navigate within iframe
+                if (fullPath.endsWith('.html') || fullPath.endsWith('/')) {
+                    const pathParts = fullPath.split('/').filter(p => p);
+                    const targetFile = pathParts[pathParts.length - 1] || 'index.html';
+                    window.location.href = targetFile;
+                } else {
+                    // For non-HTML files (raw .py, etc), open directly
+                    window.open(href, '_blank');
+                }
+            });
+        })();
+
+        // Apply theme and widget visibility immediately to prevent flicker
+        (function() {
+            const configTheme = 'dark';
+            const hasConfigUi = false;
+            const configUi = hasConfigUi ? 'None' : null;
+            const hasWidgetsConfig = false;
+            const widgetsOn = hasWidgetsConfig ? false : true;
+            let theme;
+            if (configTheme === 'auto') {
+                theme = window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
+            } else {
+                theme = localStorage.getItem('uvnote-theme') || configTheme;
+            }
+            document.documentElement.setAttribute('data-theme', theme);
+
+            // Initialize UI theme (css theme)
+            let ui = hasConfigUi ? configUi : (localStorage.getItem('uvnote-ui') || 'default');
+            if (ui !== 'default' && ui !== 'none' && ui !== 'monocolor') { ui = 'default'; }
+            document.documentElement.setAttribute('data-ui', ui);
+
+            // Apply widgets visibility
+            document.documentElement.setAttribute('data-widgets', widgetsOn ? 'on' : 'off');
+        })();
+    </script>
+    <style>
+:root[data-theme="light"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: #f6f8fa;
+    --bg-tertiary: #f8f9fa;
+    --bg-code: #f8f9fa;
+    --bg-error: #fdf2f2;
+    --bg-artifact: #e6f3ff;
+    --bg-artifact-hover: #d0e7ff;
+
+    --text-primary: #333;
+    --text-secondary: #656d76;
+    --text-error: #c53030;
+    --text-link: #0969da;
+
+    --border-primary: #e1e5e9;
+    --border-error: #e53e3e;
+    --border-cell-failed: #d73a49;
+
+    --shadow: rgba(0, 0, 0, 0.1);
+}
+
+:root[data-theme="dark"] {
+    --bg-primary: #0a0a0a;
+    --bg-secondary: #121212;
+    --bg-tertiary: #181818;
+    --bg-code: #0d0d0d;
+    --bg-error: #1a0f0f;
+    --bg-artifact: #151515;
+    --bg-artifact-hover: #1a1a1a;
+
+    --text-primary: #e0e0e0;
+    --text-secondary: #888888;
+    --text-error: #ff6b6b;
+    --text-link: #64b5f6;
+
+    --border-primary: #2a2a2a;
+    --border-error: #ff6b6b;
+    --border-cell-failed: #ff6b6b;
+
+    --shadow: rgba(255, 255, 255, 0.05);
+}
+
+/* Monocolor UI theme: black/white background, all text/borders single blue */
+:root[data-ui="monocolor"] {
+    --mono-color: #0a66ff;
+}
+
+:root[data-ui="monocolor"][data-theme="light"] {
+    --bg-primary: #ffffff;
+}
+
+:root[data-ui="monocolor"][data-theme="dark"] {
+    --bg-primary: #000000;
+}
+
+:root[data-ui="monocolor"] {
+    --bg-secondary: var(--bg-primary);
+    --bg-tertiary: var(--bg-primary);
+    --bg-code: var(--bg-primary);
+    --bg-error: var(--bg-primary);
+    --bg-artifact: var(--bg-primary);
+    --bg-artifact-hover: var(--bg-primary);
+
+    --text-primary: var(--mono-color);
+    --text-secondary: var(--mono-color);
+    --text-error: var(--mono-color);
+    --text-link: var(--mono-color);
+
+    --border-primary: var(--mono-color);
+    --border-error: var(--mono-color);
+    --border-cell-failed: var(--mono-color);
+
+    --shadow: none;
+}
+
+:root[data-ui="monocolor"] a {
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button,
+:root[data-ui="monocolor"] .theme-toggle,
+:root[data-ui="monocolor"] .reset-toggle,
+:root[data-ui="monocolor"] .back-button {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-button:hover,
+:root[data-ui="monocolor"] .theme-toggle:hover,
+:root[data-ui="monocolor"] .reset-toggle:hover,
+:root[data-ui="monocolor"] .back-button:hover {
+    background: var(--bg-primary);
+    color: var(--mono-color);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .menu-dropdown {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    box-shadow: none;
+}
+
+:root[data-ui="monocolor"] .menu-item {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .system-info {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell {
+    border-color: var(--mono-color);
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .cell-header {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .artifact:hover {
+    background: var(--bg-primary);
+}
+
+:root[data-ui="monocolor"] .artifact-preview img,
+:root[data-ui="monocolor"] .artifact-preview svg {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .status-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .minimap,
+:root[data-ui="monocolor"] .file-explorer,
+:root[data-ui="monocolor"] .tools-widget {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .cell-code {
+    background: var(--bg-primary);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tools-title,
+:root[data-ui="monocolor"] .file-explorer-section-title,
+:root[data-ui="monocolor"] .minimap-title {
+    color: var(--mono-color);
+    border-bottom-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button {
+    background: var(--bg-primary);
+    border-color: var(--mono-color);
+    color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .tool-button.active {
+    border-color: var(--mono-color);
+}
+
+:root[data-ui="monocolor"] .file-explorer-item,
+:root[data-ui="monocolor"] .minimap-item {
+    color: var(--mono-color);
+}
+
+/* Force Pygments code to mono blue on mono bg */
+:root[data-ui="monocolor"] .highlight {
+    background: var(--bg-primary) !important;
+    color: var(--mono-color) !important;
+}
+
+:root[data-ui="monocolor"] .highlight *,
+:root[data-ui="monocolor"] .highlight .hll {
+    color: var(--mono-color) !important;
+    background: transparent !important;
+    border-color: var(--mono-color) !important;
+}
+
+/* Default code font + metrics (overridable via frontmatter) */
+:root {
+    --code-font-size: 0.95rem;
+    --code-line-height: 1.5;
+    --code-pad-y: 0.75rem;
+}
+
+/* Minimal UI theme overrides base variables for a flatter, 90s look */
+:root[data-ui="none"] {
+    --bg-primary: #ffffff;
+    --bg-secondary: transparent;
+    --bg-tertiary: transparent;
+    --bg-code: #f9f9f9;
+    --bg-error: #fff0f0;
+    --bg-artifact: #f0f7ff;
+    --bg-artifact-hover: #e5f1ff;
+
+    --text-primary: #000000;
+    --text-secondary: #222222;
+    --text-error: #a00000;
+    --text-link: #0000ee;
+
+    --border-primary: #cccccc;
+    --border-error: #cc0000;
+    --border-cell-failed: #cc0000;
+
+    --shadow: none;
+}
+
+html {
+    overscroll-behavior: none;
+}
+
+body {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    line-height: 1.4;
+    max-width: 1000px;
+    margin: 0 auto;
+    padding: 15px;
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    transition: background-color 0.2s ease, color 0.2s ease;
+    overscroll-behavior: none;
+}
+
+/* Minimal "none" UI theme overrides */
+:root[data-ui="none"] body {
+    font-family: 'Times New Roman', Times, serif;
+    line-height: 1.5;
+    max-width: 860px;
+    padding: 12px;
+    background: #ffffff;
+    color: #000000;
+    transition: none;
+}
+
+/* Two panel layout removed */
+
+.controls {
+    position: fixed;
+    top: 20px;
+    right: 20px;
+    display: flex;
+    flex-direction: column;
+    align-items: flex-end;
+    gap: 0.25rem;
+    z-index: 1000;
+}
+
+.controls-buttons {
+    display: flex;
+    gap: 0.5rem;
+}
+
+.menu-button {
+    position: relative;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+/* Keep default control styling when widgets are enabled, even in minimal UI */
+:root[data-ui="none"][data-widgets="on"] .menu-button,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle,
+:root[data-ui="none"][data-widgets="on"] .back-button {
+    background: #f6f6f6;
+    border: 1px solid #cccccc;
+    color: #222222;
+}
+
+.menu-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+/* Status widget: small indicator panel pinned to the */
+/* bottom-right corner of the viewport */
+.status-widget {
+    position: fixed;
+    right: 20px;
+    bottom: 20px;
+    width: auto;
+    max-width: 260px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 6px 8px;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    z-index: 100;
+}
+
+.status-widget strong {
+    color: var(--text-primary);
+}
+
+:root[data-ui="none"][data-widgets="on"] .status-widget {
+    background: #f6f6f6;
+    border-color: #ccc;
+    color: #222;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
+:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
+:root[data-ui="none"][data-widgets="on"] .back-button:hover {
+    background: #ededed;
+    border-color: #bbbbbb;
+    color: #000000;
+}
+
+.menu-dropdown {
+    position: absolute;
+    top: 100%;
+    right: 0;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    box-shadow: 0 4px 12px var(--shadow);
+    min-width: 160px;
+    opacity: 0;
+    visibility: hidden;
+    transform: translateY(-8px);
+    transition: all 0.2s ease;
+    z-index: 1001;
+    margin-top: 4px;
+}
+
+:root[data-ui="none"][data-widgets="on"] .menu-dropdown {
+    background: #ffffff;
+    border: 1px solid #cccccc;
+    box-shadow: none;
+}
+
+.menu-button.active .menu-dropdown {
+    opacity: 1;
+    visibility: visible;
+    transform: translateY(0);
+}
+
+.menu-item {
+    display: block;
+    padding: 8px 12px;
+    color: var(--text-secondary);
+    text-decoration: none;
+    font-size: 0.85rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: pointer;
+}
+
+:root[data-ui="none"] .menu-item {
+    color: #000;
+    border-bottom: 1px solid #eee;
+}
+
+.menu-item:last-child {
+    border-bottom: none;
+}
+
+.menu-item:hover {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+}
+
+.menu-checkbox {
+    display: inline-block;
+    width: 16px;
+    font-family: monospace;
+    color: var(--text-link);
+}
+
+.theme-toggle,
+.reset-toggle,
+.back-button {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 4px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.9rem;
+    user-select: none;
+}
+
+.back-button {
+    text-decoration: none;
+    display: inline-block;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover,
+.back-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+
+.system-info {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 4px;
+    padding: 8px 12px;
+    margin-bottom: 16px;
+    font-size: 0.85em;
+    color: var(--text-secondary);
+}
+
+.system-info-header {
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 2px;
+}
+
+.system-info-content {
+    font-family: monospace;
+}
+
+.theme-toggle,
+.reset-toggle {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    /* padding: 0.4rem 0.6rem; */
+    cursor: pointer;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-secondary);
+    user-select: none;
+    transition: all 0.2s ease;
+    text-transform: lowercase;
+    letter-spacing: 0;
+}
+
+.theme-toggle:hover,
+.reset-toggle:hover {
+    background: var(--bg-tertiary);
+    border-color: var(--text-secondary);
+    color: var(--text-primary);
+}
+
+.minimap {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Hide widgets and controls when disabled via frontmatter */
+:root[data-widgets="off"] .controls,
+:root[data-widgets="off"] .minimap,
+:root[data-widgets="off"] .file-explorer,
+:root[data-widgets="off"] .tools-widget,
+:root[data-widgets="off"] .status-widget {
+    display: none !important;
+}
+
+.file-explorer {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    max-height: 400px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    overflow-y: auto;
+    z-index: 100;
+    opacity: 0.9;
+    transition: opacity 0.2s ease;
+}
+
+/* Drawing overlay */
+.draw-overlay {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100vw;
+    height: 100vh;
+    z-index: 80;
+    /* under widgets (100) and controls (1000) */
+    display: block;
+    pointer-events: none;
+    /* enabled only when a tool is active */
+}
+
+/* Tools widget */
+.tools-widget {
+    position: fixed;
+    bottom: 20px;
+    /* default; JS will stack */
+    right: 20px;
+    left: auto;
+    top: auto;
+    width: 220px;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.5rem;
+    font-size: 0.7rem;
+    z-index: 100;
+    opacity: 0.95;
+}
+
+.tools-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    user-select: none;
+}
+
+.tools-row {
+    display: flex;
+    gap: 0.4rem;
+    flex-wrap: wrap;
+}
+
+.tool-button {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    padding: 0.25rem 0.4rem;
+    cursor: pointer;
+    color: var(--text-secondary);
+    font-family: inherit;
+    font-size: 0.75rem;
+    user-select: none;
+}
+
+.tool-button:hover {
+    color: var(--text-primary);
+}
+
+.tool-button.active {
+    color: var(--text-primary);
+    border-color: var(--text-secondary);
+    background: var(--bg-secondary);
+}
+
+.minimap:hover,
+.file-explorer:hover {
+    opacity: 1;
+}
+
+.minimap-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.minimap-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.15rem 0;
+    border-left: 2px solid transparent;
+    padding-left: 0.5rem;
+    transition: all 0.2s ease;
+    cursor: pointer;
+}
+
+.minimap-item:hover {
+    color: var(--text-primary);
+    border-left-color: var(--text-secondary);
+}
+
+.minimap-item.active {
+    color: var(--text-primary);
+    border-left-color: var(--text-link);
+}
+
+.minimap-heading {
+    font-weight: normal;
+}
+
+.minimap-heading.h1 {
+    padding-left: 0.5rem;
+}
+
+.minimap-heading.h2 {
+    padding-left: 1rem;
+}
+
+.minimap-heading.h3 {
+    padding-left: 1.5rem;
+}
+
+.minimap-heading.h4 {
+    padding-left: 2rem;
+}
+
+.minimap-heading.h5 {
+    padding-left: 2.5rem;
+}
+
+.minimap-heading.h6 {
+    padding-left: 3rem;
+}
+
+.minimap-cell {
+    color: var(--text-link);
+    opacity: 0.8;
+    font-style: italic;
+}
+
+.minimap-cell:hover {
+    opacity: 1;
+}
+
+.file-explorer-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+    padding-bottom: 0.25rem;
+    border-bottom: 1px solid var(--border-primary);
+    cursor: grab;
+    /* drag handle */
+    user-select: none;
+}
+
+.file-explorer-section {
+    margin-bottom: 0.75rem;
+}
+
+.file-explorer-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin-bottom: 0.25rem;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.file-explorer-item {
+    display: block;
+    color: var(--text-secondary);
+    text-decoration: none;
+    padding: 0.1rem 0;
+    margin-left: 0.5rem;
+    transition: color 0.2s ease;
+    cursor: pointer;
+    font-family: monospace;
+}
+
+.file-explorer-item:hover {
+    color: var(--text-primary);
+}
+
+.file-explorer-item.script {
+    color: var(--text-link);
+}
+
+.file-explorer-item.artifact {
+    color: var(--text-secondary);
+    opacity: 0.8;
+}
+
+
+/* Hide widgets on smaller screens */
+@media (max-width: 768px) {
+
+    .minimap,
+    .file-explorer,
+    .tools-widget {
+        display: none;
+    }
+}
+
+.cell {
+    margin: 1rem 0;
+    border: 1px solid var(--border-primary);
+    border-radius: 2px;
+    overflow: hidden;
+    background: var(--bg-secondary);
+}
+
+:root[data-ui="none"] .cell {
+    margin: 1em 0;
+    border: none;
+    background: transparent;
+}
+
+/* Cell header base styles (every declaration lives inside this one rule) */
+.cell-header {
+    background: var(--bg-secondary);
+    padding: 0.5rem 1rem;
+    border-bottom: 1px solid var(--border-primary);
+    font-family: inherit;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    cursor: pointer;
+    user-select: none;
+    transition: background-color 0.2s ease;
+}
+
+:root[data-ui="none"] .cell-header {
+    background: transparent;
+    border: none;
+    padding: 0;
+    font-weight: bold;
+}
+
+:root[data-ui="none"] .cell-content {
+    padding: 0;
+}
+
+/* NOTE(review): other rules in this sheet style `.copy-btn`; confirm `.copy-button` matches the real markup */
+:root[data-ui="none"] .copy-button,
+:root[data-ui="none"] .collapse-indicators,
+:root[data-ui="none"] .cell-meta,
+:root[data-ui="none"] .cell-outputs-header {
+    display: none !important;
+}
+
+:root[data-ui="none"] pre,
+:root[data-ui="none"] code {
+    font-family: Menlo, Monaco, 'Courier New', monospace;
+}
+
+:root[data-ui="none"] .code-content pre {
+    background: #f9f9f9;
+    border: 1px solid #ddd;
+    padding: 8px;
+}
+
+:root[data-ui="none"] .output {
+    background: transparent;
+    border: none;
+    padding: 0.25em 0;
+}
+
+.cell-header:hover {
+    background: var(--bg-tertiary);
+}
+
+.collapse-indicators {
+    color: var(--text-secondary);
+    font-size: 0.8rem;
+    opacity: 0.7;
+}
+
+.collapse-indicators span:hover {
+    color: var(--text-primary);
+    opacity: 1;
+}
+
+.cell-code {
+    display: block;
+    background: var(--bg-code);
+}
+
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code pre {
+    margin: 0;
+    padding: 0.75rem;
+    background: var(--bg-code);
+    overflow-x: auto;
+    color: var(--text-primary);
+}
+
+.cell-output {
+    padding: 0.75rem;
+    /* background: var(--bg-primary); */
+    background: var(--bg-secondary);
+}
+
+.cell-output.collapsed {
+    display: none;
+}
+
+/* Captured stdout panel: a single rule (previously declared twice in a row).
+   The panel scrolls when its content is wider or taller than the space it gets. */
+.cell-stdout {
+    background: var(--bg-tertiary);
+    padding: 0.75rem;
+    border-radius: 1px;
+    /* margin: 0.25rem 0; */
+    font-family: inherit;
+    font-size: 0.9rem;
+    white-space: pre-wrap;
+    color: var(--text-primary);
+
+    /* key bits */
+    overflow: auto;
+    /* show scrollbars when needed */
+    max-width: 100%;
+    /* respects whatever layout width you give it */
+}
+
+.cell-stdout .stdout-text {
+    margin: 0;
+    /* reset pre default margin */
+    white-space: pre;
+    /* keep line breaks, NO wrapping */
+    display: inline-block;
+    /* shrink-to-content */
+    min-width: max-content;
+    /* allow very long lines to define intrinsic width */
+    font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+    tab-size: 2;
+}
+
+.cell-stderr {
+    background: var(--bg-error);
+    border-left: 2px solid var(--border-error);
+    padding: 1rem;
+    margin: 0.5rem 0;
+    font-family: inherit;
+    font-size: 0.9rem;
+    color: var(--text-error);
+    white-space: pre-wrap;
+}
+
+.uv-install-logs {
+    margin: 0.5rem 0;
+}
+
+.uv-logs-header {
+    cursor: pointer;
+    padding: 0.75rem;
+    /* Fall back to the shared border colour when --border-color is not defined */
+    border-left: 3px solid var(--border-color, var(--border-primary));
+    font-family: inherit;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    user-select: none;
+}
+
+.uv-logs-content {
+    background: var(--bg-secondary);
+    padding: 1rem;
+    /* Fall back to the shared border colour when --border-color is not defined */
+    border-left: 3px solid var(--border-color, var(--border-primary));
+    white-space: pre-wrap;
+    font-family: monospace;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    overflow-x: auto;
+}
+
+.cell-artifacts {
+    margin: 1rem 0;
+}
+
+.cell-artifacts h4 {
+    margin: 0 0 0.5rem 0;
+    color: var(--text-secondary);
+    font-size: 0.9rem;
+}
+
+.artifact {
+    display: inline-block;
+    background: var(--bg-artifact);
+    padding: 0.25rem 0.5rem;
+    border-radius: 1px;
+    margin: 0.25rem 0.5rem 0.25rem 0;
+    font-family: inherit;
+    font-size: 0.8rem;
+    color: var(--text-link);
+    text-decoration: none;
+    transition: background-color 0.2s ease;
+    border: 1px solid var(--border-primary);
+}
+
+.artifact:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-preview {
+    margin-top: 1rem;
+}
+
+.artifact-preview img {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.artifact-preview svg {
+    max-width: 100%;
+    height: auto;
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+    display: block;
+}
+
+/* Style SVG text elements */
+.artifact-preview svg g {
+    fill: var(--text-primary) !important;
+}
+
+/* Auto-theme SVG elements */
+.artifact-preview svg {
+    background: transparent;
+}
+
+/* Invert SVG images in dark mode */
+:root[data-theme="dark"] .artifact-preview img[src$=".svg"] {
+    filter: invert(0.9) hue-rotate(180deg);
+}
+
+/* Keep SVG images readable in monocolor mode */
+:root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] {
+    filter: none;
+}
+
+/* CSV table styling */
+.artifact-csv {
+    margin-top: 1rem;
+    overflow-x: auto;
+}
+
+.csv-table {
+    width: 100%;
+    border-collapse: collapse;
+    font-size: 0.9rem;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    border-radius: 1px;
+}
+
+.csv-table th,
+.csv-table td {
+    padding: 0.5rem 0.75rem;
+    text-align: left;
+    border: 1px solid var(--border-primary);
+}
+
+.csv-table th {
+    background: var(--bg-tertiary);
+    font-weight: 600;
+    color: var(--text-primary);
+}
+
+.csv-table tbody tr:hover {
+    background: var(--bg-artifact-hover);
+}
+
+.artifact-csv-error {
+    margin-top: 1rem;
+    padding: 1rem;
+    background: var(--bg-error);
+    color: var(--text-error);
+    border: 1px solid var(--border-error);
+    border-radius: 1px;
+}
+
+.cell-failed {
+    border-color: var(--border-cell-failed);
+}
+
+.cell-failed .cell-header {
+    background: var(--bg-error);
+    color: var(--text-error);
+}
+
+.cell-commented {
+    opacity: 0.6;
+    border-style: dashed;
+}
+
+.cell-commented .cell-header {
+    background: var(--bg-secondary);
+    color: var(--text-secondary);
+    font-style: italic;
+}
+
+.run-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+}
+
+.run-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.run-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+}
+
+.copy-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+}
+
+.copy-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.copy-btn:disabled {
+    opacity: 0.6;
+    cursor: not-allowed;
+}
+
+.copy-btn.copied {
+    color: #4caf50;
+    background: var(--bg-primary);
+    border-color: #4caf50;
+    transition: all 0.2s ease;
+}
+
+.raw-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+    text-decoration: none;
+    display: inline-block;
+}
+
+.raw-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    text-decoration: none;
+}
+
+.github-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+    text-decoration: none;
+    display: inline-block;
+}
+
+.github-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    text-decoration: none;
+}
+
+.hf-btn {
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-primary);
+    padding: 2px 6px;
+    border-radius: 2px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.75em;
+    font-family: inherit;
+    margin-left: 4px;
+    text-decoration: none;
+    display: inline-block;
+}
+
+.hf-btn:hover {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+    text-decoration: none;
+}
+
+.output-stale {
+    opacity: 0.5;
+    position: relative;
+}
+
+.output-stale::after {
+    content: '⏳ updating...';
+    position: absolute;
+    top: 8px;
+    right: 8px;
+    background: var(--bg-secondary);
+    padding: 4px 8px;
+    border-radius: 2px;
+    font-size: 0.75em;
+    color: var(--text-secondary);
+    border: 1px solid var(--border-primary);
+}
+
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+    margin-top: 1.5rem;
+    margin-bottom: 0.75rem;
+    color: var(--text-primary);
+}
+
+h1 {
+    margin-top: 0;
+    margin-bottom: 1rem;
+}
+
+p {
+    margin: 0.75rem 0;
+    color: var(--text-primary);
+}
+
+a {
+    color: var(--text-link);
+}
+
+img {
+    max-width: 100%;
+    height: auto;
+    border-radius: 1px;
+    box-shadow: none;
+}
+
+pre,
+code {
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+}
+
+.code-wrap {
+    position: relative;
+}
+
+.code-line-highlight {
+    display: none;
+    position: absolute;
+    left: 0;
+    right: 0;
+    height: 1.5em;
+    background: rgba(255, 235, 170, 0.35);
+    pointer-events: none;
+    border-left: 3px solid #f4c542;
+}
+
+.line-number {
+    cursor: pointer;
+    text-decoration: none;
+    color: var(--text-secondary);
+    padding: 0 0.25rem;
+}
+
+.line-number.selected {
+    background: rgba(255, 235, 170, 0.4);
+    color: var(--text-primary);
+}
+
+/* Line numbers */
+.highlight-with-lines {
+    display: flex;
+}
+
+.line-numbers {
+    background: var(--bg-tertiary);
+    padding: var(--code-pad-y) 0.5rem;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace;
+    font-size: var(--code-font-size);
+    line-height: var(--code-line-height);
+    color: var(--text-secondary);
+    user-select: none;
+    text-align: right;
+    border-right: 1px solid var(--border-primary);
+}
+
+.line-numbers .line-number {
+    display: block;
+    line-height: var(--code-line-height);
+}
+
+.highlight-with-lines .highlight {
+    flex: 1;
+}
+
+.highlight .hll {
+    background-color: transparent;
+}
+
+/* don't conflict with our highlight */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem;
+    line-height: var(--code-line-height);
+}
+
+/* Collapsed code styling */
+.cell-code.collapsed {
+    display: none;
+}
+
+.cell-code.expanded {
+    display: block;
+}
+
+/* Initial code visibility is selected by the config.collapse_code template flag.
+   The directives must keep their exact delimiters; a CSS formatter had split them
+   into brace blocks that neither the template engine nor the browser understands. */
+{% if config.collapse_code %}
+.cell-code {
+    display: none;
+}
+{% else %}
+.cell-code {
+    display: block;
+    border-bottom: 1px solid var(--border-primary);
+}
+{% endif %}
+
+{{ pygments_css }}
+
+/* Ensure our code metrics override Pygments defaults */
+.highlight pre {
+    white-space: pre;
+    margin: 0;
+    padding: var(--code-pad-y) 0.75rem !important;
+    line-height: var(--code-line-height) !important;
+    font-size: var(--code-font-size) !important;
+    font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important;
+    border: none;
+}
+
+.line-numbers {
+    line-height: var(--code-line-height) !important;
+}
+
+.line-numbers .line-number {
+    line-height: var(--code-line-height) !important;
+}
+
+/* Custom CSS from frontmatter */
+{{ config.custom_css }}
+
+{# Override code font size from frontmatter (accept number as px) #}
+{% if config.code_font_size is not none %}
+{% if config.code_font_size is string %}
+:root {
+    --code-font-size: {{ config.code_font_size }};
+}
+{% else %}
+:root {
+    --code-font-size: {{ config.code_font_size }}px;
+}
+{% endif %}
+{% endif %}
+
+/* Cursor for tools */
+body[data-tool="arrow"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><path d="M2 2l7.586 7.586"/><circle cx="11" cy="11" r="2"/></svg>') 12 12, crosshair;
+}
+
+body[data-tool="pen"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M12 19l7-7 3 3-7 7-3-3z"/><path d="M18 13l-1.5-7.5L2 2l3.5 14.5L13 18l5-5z"/><circle cx="4" cy="20" r="2" fill="%23e53935"/></svg>') 4 20, pointer;
+}
+
+body[data-tool="eraser"] .main-content {
+    cursor: url('data:image/svg+xml;utf8,<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="%23e53935" stroke-width="2"><path d="M20 20H7l-7-7 7-7h13v14z"/><path d="M13 13l7-7"/><path d="M13 13L9 9"/></svg>') 12 12, auto;
+}
+
+/* Color picker styles */
+.tools-section-title {
+    font-weight: bold;
+    color: var(--text-secondary);
+    font-size: 0.65rem;
+    margin: 0.75rem 0 0.5rem 0;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.color-row {
+    display: grid;
+    grid-template-columns: repeat(6, 1fr);
+    gap: 0.25rem;
+    margin-bottom: 0.5rem;
+}
+
+.color-swatch {
+    width: 18px;
+    height: 18px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    position: relative;
+}
+
+.color-swatch:hover {
+    transform: scale(1.1);
+    border-color: var(--text-secondary);
+}
+
+.color-swatch.selected {
+    border-color: var(--text-primary);
+    box-shadow: 0 0 0 2px var(--text-link);
+}
+
+.color-swatch.selected::after {
+    content: '✓';
+    position: absolute;
+    top: 50%;
+    left: 50%;
+    transform: translate(-50%, -50%);
+    color: white;
+    font-size: 10px;
+    font-weight: bold;
+    text-shadow: 1px 1px 1px black;
+}
+
+.color-input {
+    width: 24px;
+    height: 24px;
+    border: 2px solid var(--border-primary);
+    border-radius: 3px;
+    cursor: pointer;
+    background: none;
+    padding: 0;
+    grid-column: span 2;
+    justify-self: center;
+}
+
+.color-input:hover {
+    border-color: var(--text-secondary);
+}
+
+/* Thickness slider styles */
+.thickness-row {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    margin-top: 0.75rem;
+}
+
+.thickness-slider {
+    flex: 1;
+    -webkit-appearance: none;
+    appearance: none;
+    height: 4px;
+    background: var(--border-primary);
+    border-radius: 2px;
+    outline: none;
+    opacity: 0.7;
+    transition: opacity 0.2s;
+}
+
+.thickness-slider:hover {
+    opacity: 1;
+}
+
+.thickness-slider::-webkit-slider-thumb {
+    -webkit-appearance: none;
+    appearance: none;
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+}
+
+.thickness-slider::-moz-range-thumb {
+    width: 12px;
+    height: 12px;
+    background: var(--text-link);
+    border-radius: 50%;
+    cursor: pointer;
+    border: none;
+}
+
+.thickness-value {
+    font-size: 0.7rem;
+    color: var(--text-secondary);
+    min-width: 20px;
+    text-align: right;
+}
+
+.highlight {
+    background: none !important;
+}
+
+/* Loading animations */
+.loading-spinner {
+    display: inline-block;
+    width: 16px;
+    height: 16px;
+    border: 2px solid var(--border-primary);
+    border-radius: 50%;
+    border-top-color: var(--text-link);
+    animation: spin 1s linear infinite;
+    margin-right: 8px;
+    vertical-align: middle;
+}
+
+@keyframes spin {
+    to {
+        transform: rotate(360deg);
+    }
+}
+
+.loading-skeleton {
+    display: inline-block;
+    background: var(--bg-tertiary);
+    background: linear-gradient(90deg,
+            var(--bg-tertiary) 25%,
+            var(--bg-secondary) 50%,
+            var(--bg-tertiary) 75%);
+    background-size: 200% 100%;
+    animation: loading-shimmer 2s ease-in-out infinite;
+    border-radius: 2px;
+    height: 1em;
+    width: 80px;
+    vertical-align: middle;
+}
+
+@keyframes loading-shimmer {
+    0% {
+        background-position: -200% 0;
+    }
+
+    100% {
+        background-position: 200% 0;
+    }
+}
+
+/* Loading state for cell output */
+.cell-output:has(.loading-spinner) {
+    opacity: 0.7;
+    background: var(--bg-secondary);
+    /* border-left: 3px solid var(--text-link); */
+}
+    </style>
+    <script>
+// --- Drag utilities ---
+        function clamp(val, min, max) { return Math.max(min, Math.min(max, val)); }
+
+        function restorePosition(el, storageKey) {
+            try {
+                const raw = localStorage.getItem(storageKey);
+                if (!raw) return;
+                const pos = JSON.parse(raw);
+                if (typeof pos.left === 'number' && typeof pos.top === 'number') {
+                    el.style.left = pos.left + 'px';
+                    el.style.top = pos.top + 'px';
+                    el.style.right = 'auto';
+                    el.style.bottom = 'auto';
+                }
+            } catch (_) {}
+        }
+
+        function savePosition(el, storageKey) {
+            try {
+                const left = parseFloat(el.style.left || 'NaN');
+                const top = parseFloat(el.style.top || 'NaN');
+                if (!Number.isNaN(left) && !Number.isNaN(top)) {
+                    localStorage.setItem(storageKey, JSON.stringify({ left, top }));
+                }
+            } catch (_) {}
+        }
+
+
+        function makeDraggable(el, storageKey, handleEl) {
+            let dragging = false;
+            let startX = 0, startY = 0; // cursor
+            let origLeft = 0, origTop = 0; // element
+
+            const onMove = (e) => {
+                if (!dragging) return;
+                const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+                const clientY = e.touches ? e.touches[0].clientY : e.clientY;
+                const dx = clientX - startX;
+                const dy = clientY - startY;
+                const w = el.offsetWidth;
+                const h = el.offsetHeight;
+                const maxX = window.innerWidth - w;
+                const maxY = window.innerHeight - h;
+                const newLeft = clamp(origLeft + dx, 0, maxX);
+                const newTop = clamp(origTop + dy, 0, maxY);
+                el.style.left = newLeft + 'px';
+                el.style.top = newTop + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+            };
+
+            const endDrag = () => {
+                if (!dragging) return;
+                dragging = false;
+                document.removeEventListener('mousemove', onMove);
+                document.removeEventListener('mouseup', endDrag);
+                document.removeEventListener('touchmove', onMove);
+                document.removeEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grab');
+                savePosition(el, storageKey);
+                // ensure no-overlap constraint after a drag
+                try { layoutWidgetsStackedBottomRight(); } catch (_) {}
+            };
+
+            const startDrag = (e) => {
+                // Start from element's current on-screen rect
+                const elRect = el.getBoundingClientRect();
+                el.style.left = elRect.left + 'px';
+                el.style.top = elRect.top + 'px';
+                el.style.right = 'auto';
+                el.style.bottom = 'auto';
+
+                dragging = true;
+                startX = e.touches ? e.touches[0].clientX : e.clientX;
+                startY = e.touches ? e.touches[0].clientY : e.clientY;
+                origLeft = elRect.left;
+                origTop = elRect.top;
+
+                document.addEventListener('mousemove', onMove);
+                document.addEventListener('mouseup', endDrag);
+                document.addEventListener('touchmove', onMove, { passive: false });
+                document.addEventListener('touchend', endDrag);
+                handleEl && (handleEl.style.cursor = 'grabbing');
+                e.preventDefault();
+            };
+
+            (handleEl || el).addEventListener('mousedown', startDrag);
+            (handleEl || el).addEventListener('touchstart', startDrag, { passive: false });
+
+            // Apply any saved position on init
+            restorePosition(el, storageKey);
+        }
+        function toggleCell(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+            }
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+            }
+            
+            updateIndicators(cellId);
+            encodeToolStateToUrl();
+        }
+        
+        function toggleCode(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            if (codeElement) {
+                codeElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+        
+        function toggleOutput(cellId) {
+            const outputElement = document.getElementById('output-' + cellId);
+            if (outputElement) {
+                outputElement.classList.toggle('collapsed');
+                updateIndicators(cellId);
+                encodeToolStateToUrl();
+            }
+        }
+
+        function toggleUvLogs(headerElement) {
+            const contentElement = headerElement.nextElementSibling;
+            if (contentElement) {
+                const isCollapsed = contentElement.style.display === 'none';
+                contentElement.style.display = isCollapsed ? 'block' : 'none';
+                headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+
+                // Update the header indicator if it exists
+                const uvLogsDiv = headerElement.parentElement;
+                if (uvLogsDiv && uvLogsDiv.id && uvLogsDiv.id.startsWith('uv-logs-')) {
+                    const cellId = uvLogsDiv.id.replace('uv-logs-', '');
+                    const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+
+        function toggleUvLogsFromHeader(cellId) {
+            const uvLogsElement = document.getElementById('uv-logs-' + cellId);
+            const indicatorElement = document.getElementById('uv-indicator-' + cellId);
+            if (uvLogsElement) {
+                const headerElement = uvLogsElement.querySelector('.uv-logs-header');
+                const contentElement = uvLogsElement.querySelector('.uv-logs-content');
+                if (contentElement && headerElement) {
+                    const isCollapsed = contentElement.style.display === 'none';
+                    contentElement.style.display = isCollapsed ? 'block' : 'none';
+                    headerElement.textContent = isCollapsed ? '▼ UV Install Logs' : '▶ UV Install Logs';
+                    if (indicatorElement) {
+                        indicatorElement.textContent = isCollapsed ? '▼ uv-logs' : '▶ uv-logs';
+                    }
+                }
+            }
+        }
+        
+        function updateIndicators(cellId) {
+            const codeElement = document.getElementById('code-' + cellId);
+            const outputElement = document.getElementById('output-' + cellId);
+            const indicators = document.querySelector(`[onclick*="${cellId}"]`)?.closest('.cell-header')?.querySelector('.collapse-indicators');
+            
+            if (indicators) {
+                const codeCollapsed = codeElement && codeElement.classList.contains('collapsed');
+                const outputCollapsed = outputElement && outputElement.classList.contains('collapsed');
+                
+                const codeIcon = codeCollapsed ? '▶' : '▼';
+                const outputIcon = outputCollapsed ? '▶' : '▼';
+                
+                const codeSpan = indicators.querySelector('[onclick*="toggleCode"]');
+                const outputSpan = indicators.querySelector('[onclick*="toggleOutput"]');
+                
+                if (codeSpan) codeSpan.innerHTML = `${codeIcon} code`;
+                if (outputSpan) outputSpan.innerHTML = `${outputIcon} output`;
+            }
+        }
+        
+        function toggleTheme() {
+            const html = document.documentElement;
+            const currentTheme = html.getAttribute('data-theme');
+            const newTheme = currentTheme === 'dark' ? 'light' : 'dark';
+            html.setAttribute('data-theme', newTheme);
+            localStorage.setItem('uvnote-theme', newTheme);
+            updateThemeIcon();
+            updateUiDebug();
+        }
+        
+        // Two panel code removed
+        
+        function updateThemeIcon() {
+            const theme = document.documentElement.getAttribute('data-theme');
+            const toggle = document.querySelector('.theme-toggle');
+            if (toggle) {
+                toggle.textContent = theme === 'dark' ? 'light' : 'dark';
+            }
+        }
+        function setUiTheme(newUi) {
+            if (newUi !== 'default' && newUi !== 'none' && newUi !== 'monocolor') return;
+            const html = document.documentElement;
+            html.setAttribute('data-ui', newUi);
+            try { localStorage.setItem('uvnote-ui', newUi); } catch (_) {}
+            updateUiMenu();
+            updateUiDebug();
+        }
+        function updateUiMenu() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const checks = {
+                default: document.getElementById('checkbox-ui-default'),
+                none: document.getElementById('checkbox-ui-none'),
+                monocolor: document.getElementById('checkbox-ui-monocolor')
+            };
+            if (checks.default) checks.default.textContent = ui === 'default' ? '☑' : '☐';
+            if (checks.none) checks.none.textContent = ui === 'none' ? '☑' : '☐';
+            if (checks.monocolor) checks.monocolor.textContent = ui === 'monocolor' ? '☑' : '☐';
+        }
+
+        function updateUiDebug() {
+            const ui = document.documentElement.getAttribute('data-ui') || 'default';
+            const color = document.documentElement.getAttribute('data-theme') || 'light';
+            const el = document.getElementById('ui-debug');
+            if (el) {
+                el.textContent = `UI: ${ui} | Color: ${color}`;
+            }
+        }
+
+        // Line selection and deep-linking
+        function clearLineSelections() {
+            try {
+                document.querySelectorAll('.code-line-highlight').forEach(el => { el.style.display = 'none'; });
+                document.querySelectorAll('.line-number.selected').forEach(el => el.classList.remove('selected'));
+            } catch (_) {}
+        }
+
+        let _selection = null; // { cellId, a, b }
+
+        function clearSelection(updateUrl) {
+            clearLineSelections();
+            _selection = null;
+            if (updateUrl) {
+                try {
+                    const url = new URL(window.location.href);
+                    url.searchParams.delete('cell');
+                    url.searchParams.delete('line');
+                    history.replaceState(null, '', url.toString());
+                } catch (_) {}
+            }
+            updateStateIndicator();
+        }
+
+        function selectCellLine(cellId, line, updateUrl) {
+            try {
+                // Ensure only one selection across the whole document
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                // Measure line height directly from computed style
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                // Position overlay
+                overlay.style.display = 'block';
+                overlay.style.height = `${lh}px`;
+                overlay.style.top = `${pre.offsetTop + padTop + (line - 1) * lh}px`;
+
+                // Update selected class in line numbers
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    const sel = numbers.querySelector(`.line-number[data-line="${line}"]`);
+                    if (sel) sel.classList.add('selected');
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    url.searchParams.set('line', String(line));
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a: line, b: line };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLine error', e); }
+        }
+
+        function selectCellLines(cellId, startLine, endLine, updateUrl) {
+            try {
+                // normalize order
+                const a = Math.min(startLine, endLine);
+                const b = Math.max(startLine, endLine);
+                clearLineSelections();
+                const codeBox = document.getElementById(`code-${cellId}`);
+                if (!codeBox) return;
+                const pre = codeBox.querySelector('.highlight pre');
+                const overlay = document.getElementById(`line-highlight-${cellId}`);
+                const numbers = document.getElementById(`lines-${cellId}`);
+                if (!pre || !overlay) return;
+
+                const preStyle = getComputedStyle(pre);
+                const padTop = parseFloat(preStyle.paddingTop || '0');
+                const lh = parseFloat(preStyle.lineHeight || '20');
+
+                overlay.style.display = 'block';
+                overlay.style.top = `${pre.offsetTop + padTop + (a - 1) * lh}px`;
+                overlay.style.height = `${(b - a + 1) * lh}px`;
+
+                if (numbers) {
+                    numbers.querySelectorAll('.line-number').forEach(a => a.classList.remove('selected'));
+                    for (let i = a; i <= b; i++) {
+                        const el = numbers.querySelector(`.line-number[data-line="${i}"]`);
+                        if (el) el.classList.add('selected');
+                    }
+                }
+
+                if (updateUrl) {
+                    const url = new URL(window.location.href);
+                    url.searchParams.set('cell', cellId);
+                    if (a === b) url.searchParams.set('line', String(a));
+                    else url.searchParams.set('line', `${a}-${b}`);
+                    history.replaceState(null, '', url.toString());
+                }
+                _selection = { cellId, a, b };
+                updateStateIndicator();
+            } catch (e) { console.warn('selectCellLines error', e); }
+        }
+
+        // Drag-to-select support on line numbers
+        let _lineDrag = { active: false, cellId: null, start: 0 };
+        function onLineNumberMouseDown(e) {
+            const a = e.target.closest('.line-number');
+            if (!a) return;
+            e.preventDefault();
+            const cellId = a.dataset.cell;
+            const line = parseInt(a.dataset.line || '1', 10) || 1;
+            // Toggle off if this exact single line is already the only selection
+            const numbers = document.getElementById(`lines-${cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length === 1 && selected[0] === line) {
+                    clearSelection(true);
+                    return;
+                }
+            }
+            _lineDrag.active = true;
+            _lineDrag.cellId = cellId;
+            _lineDrag.start = line;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, _lineDrag.start, false);
+        }
+        function onDocMouseMove(e) {
+            if (!_lineDrag.active) return;
+            const el = document.elementFromPoint(e.clientX, e.clientY);
+            if (!el) return;
+            const a = el.closest && el.closest('.line-number');
+            if (!a) return;
+            if (a.dataset.cell !== _lineDrag.cellId) return;
+            const cur = parseInt(a.dataset.line || '1', 10) || 1;
+            selectCellLines(_lineDrag.cellId, _lineDrag.start, cur, false);
+        }
+        function onDocMouseUp(e) {
+            if (!_lineDrag.active) return;
+            const last = document.querySelector('.line-number.selected:last-of-type');
+            // finalize URL using the current selected range
+            const numbers = document.getElementById(`lines-${_lineDrag.cellId}`);
+            if (numbers) {
+                const selected = Array.from(numbers.querySelectorAll('.line-number.selected')).map(n => parseInt(n.dataset.line||'0',10)).filter(Boolean);
+                if (selected.length) {
+                    const a = Math.min(...selected); const b = Math.max(...selected);
+                    selectCellLines(_lineDrag.cellId, a, b, true);
+                }
+            }
+            _lineDrag.active = false; _lineDrag.cellId = null; _lineDrag.start = 0;
+        }
+
+        function applyLocationFromUrl() {
+            try {
+                const url = new URL(window.location.href);
+                const cell = url.searchParams.get('cell');
+                const lineParam = url.searchParams.get('line');
+                if (cell && lineParam) {
+                    if (lineParam.includes('-')) {
+                        const [a, b] = lineParam.split('-').map(x => parseInt(x, 10));
+                        if (!Number.isNaN(a) && !Number.isNaN(b)) selectCellLines(cell, a, b, false);
+                    } else {
+                        const l = parseInt(lineParam, 10);
+                        if (!Number.isNaN(l)) selectCellLine(cell, l, false);
+                    }
+                }
+                
+                // Apply tool parameters from URL
+                applyToolsFromUrl(url.searchParams);
+                
+                // Cell states will be applied later in DOMContentLoaded with proper timing
+                const encodedCellStates = url.searchParams.get('cells');
+                console.log('Encoded cell states from URL:', encodedCellStates);
+            } catch (_) {}
+        }
+        
+        function applyToolsFromUrl(params) {
+            try {
+                // Check if tools widget should be shown
+                const showTools = params.get('tools');
+                if (showTools === '1') {
+                    // Mark that tool was loaded from URL
+                    _urlLoadedTool = true;
+                    
+                    // Apply color
+                    const color = params.get('color');
+                    if (color && /^[0-9a-fA-F]{6}$/.test(color)) {
+                        setStoredArrowColor('#' + color);
+                    }
+                    
+                    // Apply thickness
+                    const thickness = params.get('thickness');
+                    if (thickness) {
+                        const value = parseInt(thickness, 10);
+                        if (value >= 1 && value <= 10) {
+                            setStoredLineThickness(value);
+                        }
+                    }
+                    
+                    // Don't override fadeout time for URL-loaded tools - let individual shapes decide
+                    
+                    // Load shapes from URL
+                    const encodedShapes = params.get('shapes');
+                    if (encodedShapes) {
+                        const decodedShapes = decodeShapesFromUrl(encodedShapes);
+                        if (decodedShapes.length > 0) {
+                            _shapes = decodedShapes;
+                            saveShapes();
+                            // Trigger render after overlay is initialized
+                            setTimeout(() => {
+                                renderOverlay();
+                            }, 300);
+                        }
+                    }
+                    
+                    // Wait for widgets to be initialized before showing tools
+                    setTimeout(() => {
+                        const toolsWidget = document.querySelector('.tools-widget');
+                        const checkbox = document.getElementById('checkbox-tools');
+                        if (toolsWidget && checkbox) {
+                            toolsWidget.style.display = 'block';
+                            checkbox.textContent = '☑';
+                            localStorage.setItem('uvnote-widget-tools', 'visible');
+                        }
+                        
+                        // Apply active tool
+                        const activeTool = params.get('tool');
+                        if (activeTool && ['arrow', 'pen', 'eraser', 'spotlight'].includes(activeTool)) {
+                            const toolBtn = Array.from(document.querySelectorAll('.tool-button')).find(btn => btn.textContent === activeTool);
+                            if (toolBtn) {
+                                toolBtn.click();
+                            }
+                        }
+                        
+                        // Re-layout widgets after showing tools
+                        layoutWidgetsStackedBottomRight();
+                    }, 200);
+                }
+            } catch (_) {}
+        }
+        
+        function captureInitialCellStates() {
+            const cells = document.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    if (codeEl) {
+                        state.c = codeEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    if (outputEl) {
+                        state.o = outputEl.classList.contains('collapsed') ? 0 : 1;
+                    }
+                    _initialCellStates[cellId] = state;
+                }
+            });
+            console.log('Captured initial cell states:', _initialCellStates);
+        }
+
+        function encodeCellStatesToUrl() {
+            // Get all cells and their collapse states
+            const cells = document.querySelectorAll('.cell');
+            const cellStates = {};
+            
+            console.log('Found cells:', cells.length);
+            
+            cells.forEach(cell => {
+                const cellId = cell.id.replace('cell-', '');
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                const initialState = _initialCellStates[cellId] || {};
+                
+                console.log(`Encoding cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    codeCollapsed: codeEl ? codeEl.classList.contains('collapsed') : 'N/A',
+                    outputCollapsed: outputEl ? outputEl.classList.contains('collapsed') : 'N/A',
+                    initialState: initialState
+                });
+                
+                if (codeEl || outputEl) {
+                    const state = {};
+                    let hasChanges = false;
+                    
+                    if (codeEl) {
+                        const currentCodeState = codeEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialCodeState = initialState.c;
+                        // Only encode if different from initial state
+                        if (initialCodeState !== undefined && currentCodeState !== initialCodeState) {
+                            state.c = currentCodeState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    if (outputEl) {
+                        const currentOutputState = outputEl.classList.contains('collapsed') ? 0 : 1;
+                        const initialOutputState = initialState.o;
+                        // Only encode if different from initial state
+                        if (initialOutputState !== undefined && currentOutputState !== initialOutputState) {
+                            state.o = currentOutputState;
+                            hasChanges = true;
+                        }
+                    }
+                    
+                    // Only include cell if it has changes from initial state
+                    if (hasChanges) {
+                        cellStates[cellId] = state;
+                        console.log(`Added cell ${cellId}:`, state);
+                    }
+                }
+            });
+            
+            console.log('Final cell states to encode:', cellStates);
+            
+            // Return empty string if no changed cells
+            if (Object.keys(cellStates).length === 0) return '';
+            
+            // Encode as compact base64 string
+            const encoded = btoa(JSON.stringify(cellStates));
+            console.log('Encoded cell states:', encoded);
+            return encoded;
+        }
+        
+        function decodeCellStatesFromUrl(encodedStates) {
+            if (!encodedStates) return {};
+            
+            try {
+                return JSON.parse(atob(encodedStates));
+            } catch (e) {
+                console.error('Failed to decode cell states:', e);
+                return {};
+            }
+        }
+        
+        function applyCellStatesFromUrl(cellStates) {
+            console.log('Applying cell states from URL:', cellStates);
+            Object.entries(cellStates).forEach(([cellId, state]) => {
+                const codeEl = document.getElementById('code-' + cellId);
+                const outputEl = document.getElementById('output-' + cellId);
+                
+                console.log(`Cell ${cellId}:`, {
+                    codeEl: !!codeEl,
+                    outputEl: !!outputEl,
+                    state: state
+                });
+                
+                if (codeEl && state.c !== undefined) {
+                    if (state.c === 0) {
+                        codeEl.classList.add('collapsed');
+                        console.log(`Collapsed code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    } else {
+                        codeEl.classList.remove('collapsed');
+                        codeEl.classList.add('expanded'); // Explicitly add expanded class
+                        console.log(`Expanded code for cell ${cellId}`, {
+                            hasCollapsedClass: codeEl.classList.contains('collapsed'),
+                            hasExpandedClass: codeEl.classList.contains('expanded'),
+                            computedDisplay: getComputedStyle(codeEl).display,
+                            classList: Array.from(codeEl.classList),
+                            elementId: codeEl.id
+                        });
+                    }
+                }
+                
+                if (outputEl && state.o !== undefined) {
+                    if (state.o === 0) {
+                        outputEl.classList.add('collapsed');
+                        console.log(`Collapsed output for cell ${cellId}`);
+                    } else {
+                        outputEl.classList.remove('collapsed');
+                        console.log(`Expanded output for cell ${cellId}`);
+                    }
+                }
+                
+                // Update visual indicators and force style recalculation
+                try {
+                    updateIndicators(cellId);
+                    // Force browser to recalculate styles
+                    if (codeEl) {
+                        codeEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - code visible: ${getComputedStyle(codeEl).display !== 'none'}`);
+                    }
+                    if (outputEl) {
+                        outputEl.offsetHeight; // Force reflow
+                        console.log(`After indicators update - output visible: ${getComputedStyle(outputEl).display !== 'none'}`);
+                    }
+                } catch (e) {
+                    console.error(`Error updating indicators for cell ${cellId}:`, e);
+                }
+            });
+        }
+
+        function encodeShapesToUrl() {
+            // Encode shapes as compact base64 string
+            if (_shapes.length === 0) return '';
+            
+            const shapeData = _shapes.map(shape => {
+                const baseData = {
+                    ct: shape.createdAt, // creation timestamp
+                    fo: shape.fadeoutTime || getFadeoutTime() // fadeout time for this shape
+                };
+                
+                if (shape.type === 'arrow') {
+                    return {
+                        ...baseData,
+                        t: 'a',
+                        x1: Math.round(shape.x1),
+                        y1: Math.round(shape.y1),
+                        x2: Math.round(shape.x2),
+                        y2: Math.round(shape.y2),
+                        c: shape.color.substring(1), // remove #
+                        w: shape.width
+                    };
+                } else if (shape.type === 'pen') {
+                    return {
+                        ...baseData,
+                        t: 'p',
+                        pts: shape.points.map(p => [Math.round(p.x), Math.round(p.y)]),
+                        c: shape.color.substring(1),
+                        w: shape.width
+                    };
+                } else if (shape.type === 'spotlight') {
+                    return {
+                        ...baseData,
+                        t: 's',
+                        x: Math.round(shape.x),
+                        y: Math.round(shape.y),
+                        r: Math.round(shape.radius)
+                    };
+                }
+            }).filter(Boolean);
+            
+            return btoa(JSON.stringify(shapeData));
+        }
+        
+        function decodeShapesFromUrl(encodedShapes) {
+            if (!encodedShapes) return [];
+            
+            try {
+                const shapeData = JSON.parse(atob(encodedShapes));
+                return shapeData.map(data => {
+                    const base = {
+                        createdAt: data.ct || Date.now(), // use encoded timestamp or current time
+                        fadeoutTime: data.fo || 0, // use encoded fadeout time or 0 (never fade)
+                        opacity: 1.0
+                    };
+                    
+                    if (data.t === 'a') {
+                        return {
+                            ...base,
+                            type: 'arrow',
+                            x1: data.x1,
+                            y1: data.y1,
+                            x2: data.x2,
+                            y2: data.y2,
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 'p') {
+                        return {
+                            ...base,
+                            type: 'pen',
+                            points: data.pts.map(([x, y]) => ({ x, y })),
+                            color: '#' + data.c,
+                            width: data.w
+                        };
+                    } else if (data.t === 's') {
+                        return {
+                            ...base,
+                            type: 'spotlight',
+                            x: data.x,
+                            y: data.y,
+                            radius: data.r,
+                            color: '#000000'
+                        };
+                    }
+                }).filter(Boolean);
+            } catch (e) {
+                console.error('Failed to decode shapes:', e);
+                return [];
+            }
+        }
+        
+        function encodeToolStateToUrl() {
+            // Don't update URL during initialization
+            if (_isInitializing) {
+                return window.location.href;
+            }
+            
+            const params = new URLSearchParams(window.location.search);
+            
+            // Check if tools widget is visible and has an active tool
+            const toolsWidget = document.querySelector('.tools-widget');
+            const activeTool = document.body.dataset.tool;
+            const hasActiveTool = activeTool && activeTool !== 'none';
+            const toolsWidgetVisible = toolsWidget && getComputedStyle(toolsWidget).display !== 'none';
+            
+            // Always handle shapes regardless of tool state
+            const encodedShapes = encodeShapesToUrl();
+            if (encodedShapes) {
+                params.set('shapes', encodedShapes);
+            } else {
+                params.delete('shapes');
+            }
+            
+            // Always preserve existing cell states from URL if present
+            const existingCellStates = params.get('cells');
+            if (existingCellStates) {
+                // Keep existing cell states - don't re-encode from DOM
+                params.set('cells', existingCellStates);
+            } else {
+                // Only encode new cell states if none exist in URL
+                const encodedCellStates = encodeCellStatesToUrl();
+                if (encodedCellStates) {
+                    params.set('cells', encodedCellStates);
+                }
+            }
+            
+            if (toolsWidgetVisible && hasActiveTool) {
+                // Include tool params when widget is visible AND tool is active
+                params.set('tools', '1');
+                params.set('tool', activeTool);
+                
+                // Get color (without # prefix)
+                const color = getArrowColor();
+                if (color && color.startsWith('#')) {
+                    params.set('color', color.substring(1));
+                }
+                
+                // Get thickness
+                const thickness = getLineThickness();
+                params.set('thickness', thickness.toString());
+            } else {
+                // Remove tool state params but keep shapes
+                params.delete('tools');
+                params.delete('tool');
+                params.delete('color');
+                params.delete('thickness');
+                params.delete('fadeout');
+            }
+            
+            // Update URL without reloading
+            const newUrl = window.location.pathname + (params.toString() ? '?' + params.toString() : '') + window.location.hash;
+            window.history.replaceState(null, '', newUrl);
+            
+            return window.location.href;
+        }
+
+        function resetLayout() {
+            try {
+                // Clear all uvnote-* keys
+                const allKeys = Object.keys(localStorage);
+                const uvnoteKeys = allKeys.filter(key => key.startsWith('uvnote-'));
+                uvnoteKeys.forEach(k => localStorage.removeItem(k));
+            } catch (_) {}
+            
+            // Clear any active selection and remove URL params
+            try { clearSelection(true); } catch(_) {}
+            // Reset active tool if any
+            try { window.setActiveTool('none'); } catch(_) {}
+            // Clear shapes
+            try { _shapes = []; saveShapes(); } catch(_) {}
+            // Reset URL-loaded tool flag
+            try { _urlLoadedTool = false; } catch(_) {}
+            // Reset all cells to expanded state
+            try {
+                const cells = document.querySelectorAll('.cell');
+                cells.forEach(cell => {
+                    const cellId = cell.id.replace('cell-', '');
+                    const codeEl = document.getElementById('code-' + cellId);
+                    const outputEl = document.getElementById('output-' + cellId);
+                    if (codeEl) codeEl.classList.remove('collapsed');
+                    if (outputEl) outputEl.classList.remove('collapsed');
+                    updateIndicators(cellId);
+                });
+            } catch(_) {}
+            
+            // Clear ALL URL parameters and reload with clean URL
+            try {
+                const cleanUrl = window.location.pathname + window.location.hash;
+                window.location.href = cleanUrl; // Use window.location.href instead of history.replaceState + reload
+            } catch (_) {
+                // Fallback - reload current page
+                location.reload();
+            }
+        }
+
+        function toggleMenu() {
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.toggle('active');
+            }
+        }
+
+        function toggleWidget(widgetName) {
+            let widget;
+            let checkbox;
+            
+            // Close the menu first
+            const menuButton = document.querySelector('.menu-button');
+            if (menuButton) {
+                menuButton.classList.remove('active');
+            }
+            
+            switch(widgetName) {
+                case 'tools':
+                    widget = document.querySelector('.tools-widget');
+                    checkbox = document.getElementById('checkbox-tools');
+                    break;
+                case 'file-explorer':
+                    widget = document.querySelector('.file-explorer');
+                    checkbox = document.getElementById('checkbox-file-explorer');
+                    break;
+                case 'minimap':
+                    widget = document.querySelector('.minimap');
+                    checkbox = document.getElementById('checkbox-minimap');
+                    break;
+                case 'status':
+                    widget = document.querySelector('.status-widget');
+                    checkbox = document.getElementById('checkbox-status');
+                    break;
+                default:
+                    return;
+            }
+            
+            if (widget && checkbox) {
+                const isVisible = getComputedStyle(widget).display !== 'none';
+                widget.style.display = isVisible ? 'none' : 'block';
+                checkbox.textContent = isVisible ? '☐' : '☑';
+                
+                // Save state to localStorage
+                try {
+                    localStorage.setItem(`uvnote-widget-${widgetName}`, isVisible ? 'hidden' : 'visible');
+                } catch (_) {}
+                
+                // Re-layout widgets after visibility change
+                try { 
+                    layoutWidgetsStackedBottomRight(); 
+                } catch (_) {}
+                
+                // Update URL when tools widget visibility changes
+                if (widgetName === 'tools') {
+                    encodeToolStateToUrl();
+                }
+            }
+        }
+        
+        function initializeWidgetVisibility() {
+            const widgets = [
+                { name: 'tools', selector: '.tools-widget' },
+                { name: 'file-explorer', selector: '.file-explorer' },
+                { name: 'minimap', selector: '.minimap' },
+                { name: 'status', selector: '.status-widget' }
+            ];
+            
+            widgets.forEach(({ name, selector }) => {
+                const defaultState = name === 'status' ? 'visible' : 'hidden';
+                const savedState = localStorage.getItem(`uvnote-widget-${name}`) || defaultState;
+                const widget = document.querySelector(selector);
+                const checkbox = document.getElementById(`checkbox-${name}`);
+                
+                if (widget && checkbox) {
+                    const isVisible = savedState === 'visible';
+                    widget.style.display = isVisible ? 'block' : 'none';
+                    checkbox.textContent = isVisible ? '☑' : '☐';
+                }
+            });
+        }
+
+        // Close menu when clicking outside
+        document.addEventListener('click', function(event) {
+            const menuButton = document.querySelector('.menu-button');
+            // Don't close if clicking on a menu item (let the item handler close it)
+            if (menuButton && !menuButton.contains(event.target)) {
+                menuButton.classList.remove('active');
+            }
+        });
+
+        // Layout: stack widgets bottom-right and equalize widths
+        function hasCustomWidgetPositions() {
+            try {
+                return (
+                    localStorage.getItem('uvnote-minimap-pos') ||
+                    localStorage.getItem('uvnote-file-explorer-pos') ||
+                    localStorage.getItem('uvnote-tools-pos')
+                );
+            } catch (_) { return false; }
+        }
+
+        function rectsOverlap(r1, r2) {
+            return !(r1.right <= r2.left || r2.right <= r1.left || r1.bottom <= r2.top || r2.bottom <= r1.top);
+        }
+
+        function widgetsOverlap(widgets) {
+            for (let i = 0; i < widgets.length; i++) {
+                const a = widgets[i];
+                const ra = a.getBoundingClientRect();
+                for (let j = i + 1; j < widgets.length; j++) {
+                    const b = widgets[j];
+                    const rb = b.getBoundingClientRect();
+                    if (rectsOverlap(ra, rb)) return true;
+                }
+            }
+            return false;
+        }
+
+        function applyStackLayout(widgets, order) {
+            if (!widgets.length) return;
+            // Fixed equal width
+            const fixedWidth = 220;
+            widgets.forEach(el => { el.style.width = fixedWidth + 'px'; });
+
+            // Fit heights if needed to avoid overflow
+            const gap = 12;
+            const available = Math.max(0, window.innerHeight - 40 - gap * (order.length - 1));
+            const eachMax = Math.floor(available / order.length);
+            order.forEach(el => {
+                el.style.maxHeight = eachMax + 'px';
+                el.style.overflowY = 'auto';
+            });
+
+            // Stack bottom-up in the requested order
+            let bottomOffset = 20; // base gutter
+            order.forEach(el => {
+                el.style.left = 'auto';
+                el.style.top = 'auto';
+                el.style.right = '20px';
+                el.style.bottom = bottomOffset + 'px';
+                bottomOffset += el.offsetHeight + gap;
+            });
+        }
+
+        function layoutWidgetsStackedBottomRight() {
+            const minimap = document.querySelector('.minimap');
+            const fileExplorer = document.querySelector('.file-explorer');
+            const tools = document.querySelector('.tools-widget');
+            const status = document.querySelector('.status-widget');
+            const widgets = [minimap, fileExplorer, tools, status].filter(el => el && getComputedStyle(el).display !== 'none');
+            if (!widgets.length) return;
+
+            const order = [minimap, fileExplorer, tools, status].filter(Boolean).filter(el => getComputedStyle(el).display !== 'none');
+
+            // If user placed custom positions and there is no overlap, respect them.
+            if (hasCustomWidgetPositions() && !widgetsOverlap(widgets)) return;
+
+            applyStackLayout(widgets, order);
+        }
+        
+        // Panel icon removed
+        
+        let _minimapScrollContainer = null;
+        let _minimapScrollHandler = null;
+        function initMinimap() {
+            // Generate minimap content
+            const minimap = createMinimap();
+            document.body.appendChild(minimap);
+            // Make draggable (use title as handle)
+            const mTitle = minimap.querySelector('.minimap-title');
+            makeDraggable(minimap, 'uvnote-minimap-pos', mTitle);
+
+            // Attach scroll listener to window (two-panel removed)
+            _minimapScrollContainer = window;
+
+            if (_minimapScrollContainer) {
+                _minimapScrollHandler = () => updateMinimapActive();
+                if (_minimapScrollContainer === window) {
+                    window.addEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.addEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            updateMinimapActive();
+        }
+
+        function teardownMinimap() {
+            const minimap = document.querySelector('.minimap');
+            if (minimap && minimap.parentNode) minimap.parentNode.removeChild(minimap);
+            if (_minimapScrollContainer && _minimapScrollHandler) {
+                if (_minimapScrollContainer === window) {
+                    window.removeEventListener('scroll', _minimapScrollHandler);
+                } else {
+                    _minimapScrollContainer.removeEventListener('scroll', _minimapScrollHandler);
+                }
+            }
+            _minimapScrollContainer = null;
+            _minimapScrollHandler = null;
+        }
+        
+        function initFileExplorer() {
+            // Generate file explorer content
+            const fileExplorer = createFileExplorer();
+            document.body.appendChild(fileExplorer);
+        }
+        
+        function createMinimap() {
+            const minimap = document.createElement('div');
+            minimap.className = 'minimap';
+            
+            const title = document.createElement('div');
+            title.className = 'minimap-title';
+            title.textContent = 'navigation';
+            minimap.appendChild(title);
+            
+            // Find all headings and cells
+            const root = document.querySelector('.main-content') || document;
+            const headings = root.querySelectorAll('h1, h2, h3, h4, h5, h6');
+            const cells = root.querySelectorAll('.cell');
+            
+            // Combine and sort by position
+            const items = [];
+            
+            headings.forEach(heading => {
+                const id = heading.id || generateId(heading.textContent);
+                if (!heading.id) heading.id = id;
+                
+                items.push({
+                    element: heading,
+                    type: 'heading',
+                    level: parseInt(heading.tagName.charAt(1)),
+                    text: heading.textContent.trim(),
+                    id: id,
+                    position: heading.getBoundingClientRect().top + window.scrollY
+                });
+            });
+            
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const id = cell.id || `cell-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+                    if (!cell.id) cell.id = id;
+                    
+                    items.push({
+                        element: cell,
+                        type: 'cell',
+                        text: header.textContent.trim(),
+                        id: id,
+                        position: cell.getBoundingClientRect().top + window.scrollY
+                    });
+                }
+            });
+            
+            // Sort by position
+            items.sort((a, b) => a.position - b.position);
+            
+            // Create minimap items
+            items.forEach(item => {
+                const link = document.createElement('a');
+                link.className = `minimap-item ${item.type === 'heading' ? 'minimap-heading' : 'minimap-cell'}`;
+                if (item.type === 'heading') {
+                    link.classList.add(`h${item.level}`);
+                }
+                link.textContent = item.text.length > 25 ? item.text.substring(0, 22) + '...' : item.text;
+                link.href = `#${item.id}`;
+                link.onclick = function(e) {
+                    e.preventDefault();
+                    item.element.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                };
+                minimap.appendChild(link);
+            });
+            
+            return minimap;
+        }
+        
+        function generateId(text) {
+            return text.toLowerCase()
+                .replace(/[^a-z0-9]+/g, '-')
+                .replace(/^-+|-+$/g, '')
+                .substring(0, 20);
+        }
+        
+        function updateMinimapActive() {
+            const minimapItems = document.querySelectorAll('.minimap-item');
+            const container = _minimapScrollContainer || window;
+            const containerRect = container === window ? null : container.getBoundingClientRect();
+            const scrollPos = (container === window ? window.scrollY : container.scrollTop) + 100; // Offset for better detection
+            
+            let activeItem = null;
+            minimapItems.forEach(item => {
+                const targetId = item.getAttribute('href').substring(1);
+                const target = document.getElementById(targetId);
+                
+                if (target) {
+                    const rectTop = target.getBoundingClientRect().top;
+                    const targetPos = (container === window)
+                        ? rectTop + window.scrollY
+                        : rectTop - containerRect.top + container.scrollTop;
+                    if (targetPos <= scrollPos) {
+                        activeItem = item;
+                    }
+                }
+                
+                item.classList.remove('active');
+            });
+            
+            if (activeItem) {
+                activeItem.classList.add('active');
+            }
+        }
+        
+        function createFileExplorer() {
+            const fileExplorer = document.createElement('div');
+            fileExplorer.className = 'file-explorer';
+            
+            const title = document.createElement('div');
+            title.className = 'file-explorer-title';
+            title.textContent = 'files';
+            fileExplorer.appendChild(title);
+            // Make draggable (use title as handle)
+            makeDraggable(fileExplorer, 'uvnote-file-explorer-pos', title);
+            
+            // Scripts section
+            const scriptsSection = document.createElement('div');
+            scriptsSection.className = 'file-explorer-section';
+            
+            const scriptsTitle = document.createElement('div');
+            scriptsTitle.className = 'file-explorer-section-title';
+            scriptsTitle.textContent = 'scripts';
+            scriptsSection.appendChild(scriptsTitle);
+            
+            // Find all cells and list their script files (single panel)
+            const root = document.querySelector('.main-content') || document;
+            const cells = root.querySelectorAll('.cell');
+            cells.forEach(cell => {
+                const header = cell.querySelector('.cell-header');
+                if (header) {
+                    const cellText = header.textContent.trim();
+                    const cellMatch = cellText.match(/Cell: ([a-zA-Z_][a-zA-Z0-9_]*)/);
+                    if (cellMatch) {
+                        const cellId = cellMatch[1];
+                        const scriptItem = document.createElement('div');
+                        scriptItem.className = 'file-explorer-item script';
+                        scriptItem.textContent = `${cellId}.py`;
+                        scriptItem.onclick = function() {
+                            cell.scrollIntoView({ behavior: 'smooth', block: 'start' });
+                        };
+                        scriptsSection.appendChild(scriptItem);
+                    }
+                }
+            });
+            
+            fileExplorer.appendChild(scriptsSection);
+            
+            // Artifacts section
+            const artifactsSection = document.createElement('div');
+            artifactsSection.className = 'file-explorer-section';
+            
+            const artifactsTitle = document.createElement('div');
+            artifactsTitle.className = 'file-explorer-section-title';
+            artifactsTitle.textContent = 'artifacts';
+            artifactsSection.appendChild(artifactsTitle);
+            
+            // Find all artifact links (single panel)
+            const artifactsRoot = document.querySelector('.main-content') || document;
+            const artifacts = artifactsRoot.querySelectorAll('.artifact');
+            if (artifacts.length === 0) {
+                const noArtifacts = document.createElement('div');
+                noArtifacts.className = 'file-explorer-item artifact';
+                noArtifacts.textContent = '(none)';
+                noArtifacts.style.opacity = '0.5';
+                artifactsSection.appendChild(noArtifacts);
+            } else {
+                artifacts.forEach(artifact => {
+                    const artifactItem = document.createElement('div');
+                    artifactItem.className = 'file-explorer-item artifact';
+                    artifactItem.textContent = artifact.textContent;
+                    artifactItem.onclick = function() {
+                        artifact.click();
+                    };
+                    artifactsSection.appendChild(artifactItem);
+                });
+            }
+            
+            fileExplorer.appendChild(artifactsSection);
+            
+            return fileExplorer;
+        }
+
+        function initStatusWidget() {
+            let el = document.querySelector('.status-widget');
+            if (!el) {
+                el = document.createElement('div');
+                el.className = 'status-widget';
+                el.id = 'status-widget';
+                el.textContent = 'ready — Esc';
+                document.body.appendChild(el);
+            }
+        }
+
+        // Tools widget
+        let _cursorX = 0;
+        let _cursorY = 0;
+        let _cursorVisible = false;
+        
+        function setActiveTool(tool) {
+            if (!tool || tool === 'none') {
+                document.body.dataset.tool = 'none';
+                localStorage.setItem('uvnote-active-tool', 'none');
+                setOverlayActive(false);
+                _cursorVisible = false;
+                // Remove active class from all tool buttons when deactivating
+                const toolButtons = document.querySelectorAll('.tools-widget .tool-button');
+                toolButtons.forEach(btn => btn.classList.remove('active'));
+                updateStateIndicator();
+                encodeToolStateToUrl();
+                return;
+            }
+            document.body.dataset.tool = tool;
+            localStorage.setItem('uvnote-active-tool', tool);
+            setOverlayActive(true);
+            _cursorVisible = true;
+            updateStateIndicator();
+            encodeToolStateToUrl();
+        }
+
+        // Make setActiveTool globally accessible for ESC key handler
+        // (resetLayout also reaches it through window.setActiveTool).
+        window.setActiveTool = setActiveTool;
+        
+
+        function getArrowColor() {
+            const saved = localStorage.getItem('uvnote-arrow-color');
+            if (saved) return saved;
+            return '#e53935'; // Default red color
+        }
+
+        function setStoredArrowColor(color) {
+            try { localStorage.setItem('uvnote-arrow-color', color); } catch (_) {}
+        }
+
+        function getLineThickness() {
+            const saved = localStorage.getItem('uvnote-line-thickness');
+            if (saved) return parseInt(saved, 10);
+            return 6; // default thickness
+        }
+
+        function setStoredLineThickness(thickness) {
+            try { localStorage.setItem('uvnote-line-thickness', thickness); } catch (_) {}
+        }
+
+        function getFadeoutTime() {
+            const saved = localStorage.getItem('uvnote-fadeout-time');
+            if (saved) return parseInt(saved, 10);
+            return 5; // default 5 seconds
+        }
+
+        function setStoredFadeoutTime(seconds) {
+            try { localStorage.setItem('uvnote-fadeout-time', seconds); } catch (_) {}
+        }
+
+        function createToolsWidget() {
+            const tools = document.createElement('div');
+            tools.className = 'tools-widget';
+
+            const title = document.createElement('div');
+            title.className = 'tools-title';
+            title.textContent = 'tools';
+            tools.appendChild(title);
+
+            const row = document.createElement('div');
+            row.className = 'tools-row';
+            tools.appendChild(row);
+
+            // Arrow tool
+            const arrowBtn = document.createElement('div');
+            arrowBtn.className = 'tool-button';
+            arrowBtn.textContent = 'arrow';
+            arrowBtn.onclick = function() {
+                const isActive = arrowBtn.classList.contains('active');
+                if (isActive) {
+                    arrowBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    arrowBtn.classList.add('active');
+                    setActiveTool('arrow');
+                }
+            };
+            row.appendChild(arrowBtn);
+
+            // Pen tool
+            const penBtn = document.createElement('div');
+            penBtn.className = 'tool-button';
+            penBtn.textContent = 'pen';
+            penBtn.onclick = function() {
+                const isActive = penBtn.classList.contains('active');
+                if (isActive) {
+                    penBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    penBtn.classList.add('active');
+                    setActiveTool('pen');
+                }
+            };
+            row.appendChild(penBtn);
+
+            // Eraser tool
+            const eraseBtn = document.createElement('div');
+            eraseBtn.className = 'tool-button';
+            eraseBtn.textContent = 'eraser';
+            eraseBtn.onclick = function() {
+                const isActive = eraseBtn.classList.contains('active');
+                if (isActive) {
+                    eraseBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    eraseBtn.classList.add('active');
+                    setActiveTool('eraser');
+                }
+            };
+            row.appendChild(eraseBtn);
+
+            // Spotlight tool
+            const spotlightBtn = document.createElement('div');
+            spotlightBtn.className = 'tool-button';
+            spotlightBtn.textContent = 'spotlight';
+            spotlightBtn.onclick = function() {
+                const isActive = spotlightBtn.classList.contains('active');
+                if (isActive) {
+                    spotlightBtn.classList.remove('active');
+                    setActiveTool('none');
+                } else {
+                    tools.querySelectorAll('.tool-button').forEach(b => b.classList.remove('active'));
+                    spotlightBtn.classList.add('active');
+                    setActiveTool('spotlight');
+                }
+            };
+            row.appendChild(spotlightBtn);
+
+            // Clear all
+            const clearBtn = document.createElement('div');
+            clearBtn.className = 'tool-button';
+            clearBtn.textContent = 'clear';
+            clearBtn.onclick = function() {
+                _shapes = [];
+                saveShapes();
+                renderOverlay();
+            };
+            row.appendChild(clearBtn);
+            
+            // We'll add the copy button at the end of the widget
+
+            // Restore active state from storage
+            const saved = localStorage.getItem('uvnote-active-tool') || 'none';
+            if (saved === 'arrow') {
+                arrowBtn.classList.add('active');
+                setActiveTool('arrow');
+            } else if (saved === 'pen') {
+                penBtn.classList.add('active');
+                setActiveTool('pen');
+            } else if (saved === 'eraser') {
+                eraseBtn.classList.add('active');
+                setActiveTool('eraser');
+            } else if (saved === 'spotlight') {
+                spotlightBtn.classList.add('active');
+                setActiveTool('spotlight');
+            }
+
+            // Color selector
+            const colorTitle = document.createElement('div');
+            colorTitle.className = 'tools-section-title';
+            colorTitle.textContent = 'color';
+            tools.appendChild(colorTitle);
+
+            const colorRow = document.createElement('div');
+            colorRow.className = 'tools-row color-row';
+            tools.appendChild(colorRow);
+
+            const swatchColors = [
+                // Primary colors
+                '#e53935', '#fb8c00', '#fdd835', '#43a047', '#1e88e5', '#8e24aa',
+                // Additional useful colors  
+                '#ff5722', '#795548', '#607d8b', '#9c27b0',
+                // Grayscale
+                '#000000', '#424242', '#9e9e9e', '#ffffff'
+            ];
+            const swatches = [];
+            swatchColors.forEach(c => {
+                const s = document.createElement('div');
+                s.className = 'color-swatch';
+                s.style.backgroundColor = c;
+                s.title = c;
+                s.onclick = () => {
+                    setStoredArrowColor(c);
+                    refreshColorUI(c);
+                    if (_cursorVisible) renderOverlay();
+                    encodeToolStateToUrl();
+                };
+                colorRow.appendChild(s);
+                swatches.push(s);
+            });
+
+            const colorInput = document.createElement('input');
+            colorInput.type = 'color';
+            colorInput.className = 'color-input';
+            colorInput.oninput = () => {
+                setStoredArrowColor(colorInput.value);
+                refreshColorUI(colorInput.value);
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+            colorRow.appendChild(colorInput);
+
+            function refreshColorUI(selected) {
+                const selectedHex = selected.startsWith('#') ? selected.toLowerCase() : rgbToHex(selected);
+                
+                swatches.forEach((s, i) => {
+                    const swatchHex = swatchColors[i].toLowerCase();
+                    if (swatchHex === selectedHex) {
+                        s.classList.add('selected');
+                    } else {
+                        s.classList.remove('selected');
+                    }
+                });
+                
+                try { 
+                    colorInput.value = selectedHex; 
+                } catch (_) {}
+            }
+
+            function rgbToHex(rgb) {
+                const m = rgb.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)\)/i);
+                if (!m) return '#000000';
+                const r = parseInt(m[1]).toString(16).padStart(2, '0');
+                const g = parseInt(m[2]).toString(16).padStart(2, '0');
+                const b = parseInt(m[3]).toString(16).padStart(2, '0');
+                return `#${r}${g}${b}`;
+            }
+
+            // Restore color selection
+            refreshColorUI(getArrowColor());
+
+            // Thickness slider
+            const thicknessTitle = document.createElement('div');
+            thicknessTitle.className = 'tools-section-title';
+            thicknessTitle.textContent = 'thickness';
+            tools.appendChild(thicknessTitle);
+
+            const thicknessRow = document.createElement('div');
+            thicknessRow.className = 'thickness-row';
+            tools.appendChild(thicknessRow);
+
+            const thicknessSlider = document.createElement('input');
+            thicknessSlider.type = 'range';
+            thicknessSlider.className = 'thickness-slider';
+            thicknessSlider.min = '1';
+            thicknessSlider.max = '10';
+            thicknessSlider.value = getLineThickness();
+            
+            const thicknessValue = document.createElement('span');
+            thicknessValue.className = 'thickness-value';
+            thicknessValue.textContent = thicknessSlider.value + 'px';
+
+            thicknessSlider.oninput = function() {
+                const value = parseInt(thicknessSlider.value, 10);
+                setStoredLineThickness(value);
+                thicknessValue.textContent = value + 'px';
+                if (_cursorVisible) renderOverlay();
+                encodeToolStateToUrl();
+            };
+
+            thicknessRow.appendChild(thicknessSlider);
+            thicknessRow.appendChild(thicknessValue);
+
+            // Fadeout time slider
+            const fadeoutTitle = document.createElement('div');
+            fadeoutTitle.className = 'tools-section-title';
+            fadeoutTitle.textContent = 'fadeout time';
+            tools.appendChild(fadeoutTitle);
+
+            const fadeoutRow = document.createElement('div');
+            fadeoutRow.className = 'thickness-row';
+            tools.appendChild(fadeoutRow);
+
+            const fadeoutSlider = document.createElement('input');
+            fadeoutSlider.type = 'range';
+            fadeoutSlider.className = 'thickness-slider';
+            fadeoutSlider.min = '0';
+            fadeoutSlider.max = '30';
+            fadeoutSlider.value = getFadeoutTime();
+            
+            const fadeoutValue = document.createElement('span');
+            fadeoutValue.className = 'thickness-value';
+            fadeoutValue.textContent = fadeoutSlider.value === '0' ? 'never' : fadeoutSlider.value + 's';
+
+            fadeoutSlider.oninput = function() {
+                const value = parseInt(fadeoutSlider.value, 10);
+                setStoredFadeoutTime(value);
+                fadeoutValue.textContent = value === 0 ? 'never' : value + 's';
+                encodeToolStateToUrl();
+            };
+
+            fadeoutRow.appendChild(fadeoutSlider);
+            fadeoutRow.appendChild(fadeoutValue);
+
+            // Draggable behavior
+            makeDraggable(tools, 'uvnote-tools-pos', title);
+
+            return tools;
+        }
+
+        function initTools() {
+            const widget = createToolsWidget();
+            document.body.appendChild(widget);
+        }
+
+        function teardownTools() {
+            const w = document.querySelector('.tools-widget');
+            if (w && w.parentNode) w.parentNode.removeChild(w);
+        }
+
+        // --- Canvas overlay for tools ---
+        // Module-level state shared by the overlay functions below.
+        let _overlay = null; // <canvas> element created by initOverlay(); null while torn down
+        let _overlayCtx = null; // 2D rendering context of _overlay
+        let _overlayContainer = null; // scroll container the overlay follows; set to window
+        let _overlayMode = 'single';
+        let _overlayResizeHandler = null; // window 'resize' listener, kept so it can be removed
+        let _overlayScrollHandler = null; // window 'scroll' listener, kept so it can be removed
+        let _drawing = null; // shape in progress: arrow {x1,y1,x2,y2}, pen {type,points} or spotlight {type,x,y,radius}
+        let _shapes = []; // committed shapes, persisted to localStorage
+        let _fadeTimer = null; // setInterval handle driving updateShapesFade()
+        let _urlLoadedTool = false; // track if tool was loaded from URL
+        let _isInitializing = true; // prevent URL updates during initialization
+        let _initialCellStates = {}; // track initial cell states from page load
+
+        function getOverlayStorageKey() { return 'uvnote-shapes'; }
+
+        function loadShapes() {
+            try {
+                const raw = localStorage.getItem(getOverlayStorageKey());
+                _shapes = raw ? JSON.parse(raw) : [];
+            } catch (_) { _shapes = []; }
+        }
+
+        function saveShapes() {
+            try { 
+                localStorage.setItem(getOverlayStorageKey(), JSON.stringify(_shapes));
+                // Always update URL when shapes change
+                encodeToolStateToUrl();
+            } catch (_) {}
+        }
+
+        function updateShapesFade() {
+            const now = Date.now();
+            let needsUpdate = false;
+
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const shape = _shapes[i];
+                if (!shape.createdAt) continue; // Skip old shapes without timestamps
+                
+                // Use individual shape's fadeout time, or global if not set
+                const shapesFadeoutSeconds = shape.fadeoutTime !== undefined ? shape.fadeoutTime : getFadeoutTime();
+                
+                // Skip fading if fadeout is disabled for this shape
+                if (shapesFadeoutSeconds === 0) continue;
+                
+                const fadeStartTime = Math.max(0, (shapesFadeoutSeconds - 2) * 1000); // Start fading 2s before end
+                const fadeEndTime = shapesFadeoutSeconds * 1000; // Fully gone after specified time
+                const age = now - shape.createdAt;
+                
+                if (age >= fadeEndTime) {
+                    // Remove completely faded shapes
+                    _shapes.splice(i, 1);
+                    needsUpdate = true;
+                } else if (age >= fadeStartTime) {
+                    // Update opacity for fading shapes
+                    const fadeProgress = (age - fadeStartTime) / (fadeEndTime - fadeStartTime);
+                    const newOpacity = 1 - fadeProgress;
+                    if (Math.abs(shape.opacity - newOpacity) > 0.01) {
+                        shape.opacity = newOpacity;
+                        needsUpdate = true;
+                    }
+                }
+            }
+
+            if (needsUpdate) {
+                saveShapes();
+                renderOverlay();
+                // Update URL to remove faded shapes  
+                encodeToolStateToUrl();
+            }
+        }
+
+        function getContentContainer() { return window; }
+
+        function updateOverlayModeAndContainer() {
+            _overlayContainer = window;
+            _overlayMode = 'single';
+        }
+
+        function updateOverlayBounds() {
+            if (!_overlay) return;
+            if (_overlayContainer === window) {
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = '0px';
+                _overlay.style.top = '0px';
+                _overlay.width = window.innerWidth;
+                _overlay.height = window.innerHeight;
+            } else {
+                const rect = _overlayContainer.getBoundingClientRect();
+                _overlay.style.position = 'fixed';
+                _overlay.style.left = rect.left + 'px';
+                _overlay.style.top = rect.top + 'px';
+                _overlay.width = Math.max(0, Math.floor(rect.width));
+                _overlay.height = Math.max(0, Math.floor(rect.height));
+            }
+            renderOverlay();
+        }
+
+        function containerScrollLeft() {
+            return (_overlayContainer === window) ? (window.scrollX || 0) : (_overlayContainer.scrollLeft || 0);
+        }
+        function containerScrollTop() {
+            return (_overlayContainer === window) ? (window.scrollY || 0) : (_overlayContainer.scrollTop || 0);
+        }
+
+        function toCanvasCoords(clientX, clientY) {
+            const rect = _overlay.getBoundingClientRect();
+            return { x: clientX - rect.left, y: clientY - rect.top };
+        }
+
+        function onPointerDown(e) {
+            const tool = document.body.dataset.tool;
+            if (tool === 'arrow') {
+                startDrawArrow(e);
+            } else if (tool === 'pen') {
+                startDrawPen(e);
+            } else if (tool === 'eraser') {
+                eraseAt(e);
+            } else if (tool === 'spotlight') {
+                startDrawSpotlight(e);
+            }
+        }
+
+        function onPointerMove(e) {
+            // Update cursor position
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _cursorX = pt.x;
+            _cursorY = pt.y;
+            
+            if (!_drawing) {
+                // Just update cursor position and re-render
+                if (_cursorVisible) {
+                    renderOverlay();
+                }
+                return;
+            }
+            
+            if (_drawing.type === 'pen') {
+                moveDrawPen(e);
+            } else if (_drawing.type === 'spotlight') {
+                moveDrawSpotlight(e);
+            } else {
+                moveDrawArrow(e);
+            }
+        }
+        
+        function onPointerEnter(e) {
+            _cursorVisible = document.body.dataset.tool !== 'none';
+            if (_cursorVisible) {
+                renderOverlay();
+            }
+        }
+        
+        // mouseleave handler: hide the cursor preview and redraw so the preview
+        // disappears from the canvas.
+        function onPointerLeave(e) {
+            _cursorVisible = false;
+            renderOverlay();
+        }
+
+        function onPointerUp(e) {
+            if (!_drawing) return;
+            if (_drawing.type === 'pen') {
+                endDrawPen();
+            } else if (_drawing.type === 'spotlight') {
+                endDrawSpotlight();
+            } else {
+                endDrawArrow();
+            }
+        }
+
+        function startDrawArrow(e) {
+            if (document.body.dataset.tool !== 'arrow') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                x1: pt.x + containerScrollLeft(),
+                y1: pt.y + containerScrollTop(),
+                x2: pt.x + containerScrollLeft(),
+                y2: pt.y + containerScrollTop(),
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawArrow(e) {
+            if (!_drawing) return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.x2 = pt.x + containerScrollLeft();
+            _drawing.y2 = pt.y + containerScrollTop();
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawArrow() {
+            if (!_drawing) return;
+            _shapes.push({ 
+                type: 'arrow', 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawPen(e) {
+            if (document.body.dataset.tool !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'pen',
+                points: [{
+                    x: pt.x + containerScrollLeft(),
+                    y: pt.y + containerScrollTop()
+                }],
+                color: getArrowColor(),
+                width: getLineThickness()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawPen(e) {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing.points.push({
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop()
+            });
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawPen() {
+            if (!_drawing || _drawing.type !== 'pen') return;
+            if (_drawing.points.length > 1) {
+                _shapes.push({ 
+                    ..._drawing,
+                    createdAt: Date.now(),
+                    fadeoutTime: getFadeoutTime(),
+                    opacity: 1.0
+                });
+            }
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function startDrawSpotlight(e) {
+            if (document.body.dataset.tool !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            _drawing = {
+                type: 'spotlight',
+                x: pt.x + containerScrollLeft(),
+                y: pt.y + containerScrollTop(),
+                radius: getLineThickness() * 20, // Use thickness to control spotlight size (bigger default)
+                color: getArrowColor()
+            };
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function moveDrawSpotlight(e) {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const dx = pt.x + containerScrollLeft() - _drawing.x;
+            const dy = pt.y + containerScrollTop() - _drawing.y;
+            _drawing.radius = Math.max(20, Math.sqrt(dx * dx + dy * dy)); // Minimum radius of 20
+            renderOverlay();
+            e.preventDefault();
+        }
+
+        function endDrawSpotlight() {
+            if (!_drawing || _drawing.type !== 'spotlight') return;
+            _shapes.push({ 
+                ..._drawing,
+                createdAt: Date.now(),
+                fadeoutTime: getFadeoutTime(),
+                opacity: 1.0
+            });
+            _drawing = null;
+            saveShapes();
+            renderOverlay();
+        }
+
+        function distPointToSegment(px, py, x1, y1, x2, y2) {
+            const dx = x2 - x1, dy = y2 - y1;
+            if (dx === 0 && dy === 0) return Math.hypot(px - x1, py - y1);
+            const t = Math.max(0, Math.min(1, ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)));
+            const cx = x1 + t * dx, cy = y1 + t * dy;
+            return Math.hypot(px - cx, py - cy);
+        }
+
+        function eraseAt(e) {
+            const pt = toCanvasCoords(e.touches ? e.touches[0].clientX : e.clientX, e.touches ? e.touches[0].clientY : e.clientY);
+            const x = pt.x + containerScrollLeft();
+            const y = pt.y + containerScrollTop();
+            const threshold = 10; // pixels
+            for (let i = _shapes.length - 1; i >= 0; i--) {
+                const s = _shapes[i];
+                if (s.type === 'arrow') {
+                    const d = distPointToSegment(x, y, s.x1, s.y1, s.x2, s.y2);
+                    if (d <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                } else if (s.type === 'pen' && s.points) {
+                    // Check if click is near any line segment in the pen stroke
+                    let minDist = Infinity;
+                    for (let j = 1; j < s.points.length; j++) {
+                        const d = distPointToSegment(x, y, s.points[j-1].x, s.points[j-1].y, s.points[j].x, s.points[j].y);
+                        minDist = Math.min(minDist, d);
+                    }
+                    if (minDist <= threshold) {
+                        _shapes.splice(i, 1);
+                        saveShapes();
+                        renderOverlay();
+                        break;
+                    }
+                }
+            }
+            e.preventDefault();
+        }
+
+        function drawArrow(ctx, x1, y1, x2, y2, color, width, opacity = 1.0) {
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.fillStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            
+            // Check if points are too close (initial state)
+            const dx = x2 - x1;
+            const dy = y2 - y1;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            
+            if (distance < 5) {
+                // Draw just a small arrowhead pointing down-right when first clicked
+                const defaultAngle = Math.PI / 4; // 45 degrees (down-right)
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate arrowhead points
+                const hx1 = x1 + headLength * Math.cos(defaultAngle - headAngle);
+                const hy1 = y1 + headLength * Math.sin(defaultAngle - headAngle);
+                const hx2 = x1 + headLength * Math.cos(defaultAngle + headAngle);
+                const hy2 = y1 + headLength * Math.sin(defaultAngle + headAngle);
+                
+                // Draw arrowhead only
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            } else {
+                // Normal arrow drawing - head at x1,y1, tail at x2,y2
+                const angle = Math.atan2(y1 - y2, x1 - x2);
+                const headLength = Math.min(15 + width * 1.5, 25);
+                const headAngle = Math.PI / 6;
+                
+                // Calculate where the line should end (before the arrowhead)
+                const lineEndX = x1 - headLength * 0.8 * Math.cos(angle);
+                const lineEndY = y1 - headLength * 0.8 * Math.sin(angle);
+                
+                // Draw the line from tail to near the head
+                ctx.beginPath();
+                ctx.moveTo(x2, y2);
+                ctx.lineTo(lineEndX, lineEndY);
+                ctx.stroke();
+                
+                // Calculate arrowhead points
+                const hx1 = x1 - headLength * Math.cos(angle - headAngle);
+                const hy1 = y1 - headLength * Math.sin(angle - headAngle);
+                const hx2 = x1 - headLength * Math.cos(angle + headAngle);
+                const hy2 = y1 - headLength * Math.sin(angle + headAngle);
+                
+                // Draw arrowhead
+                ctx.beginPath();
+                ctx.moveTo(x1, y1);
+                ctx.lineTo(hx1, hy1);
+                ctx.lineTo(hx2, hy2);
+                ctx.closePath();
+                ctx.fill();
+            }
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawPen(ctx, points, color, width, offX, offY, opacity = 1.0) {
+            if (!points || points.length < 2) return;
+            
+            // Set opacity
+            const oldAlpha = ctx.globalAlpha;
+            ctx.globalAlpha = opacity;
+            
+            ctx.strokeStyle = color;
+            ctx.lineWidth = width;
+            ctx.lineCap = 'round';
+            ctx.lineJoin = 'round';
+            ctx.beginPath();
+            ctx.moveTo(points[0].x - offX, points[0].y - offY);
+            for (let i = 1; i < points.length; i++) {
+                ctx.lineTo(points[i].x - offX, points[i].y - offY);
+            }
+            ctx.stroke();
+            
+            // Restore opacity
+            ctx.globalAlpha = oldAlpha;
+        }
+
+        function drawAllSpotlights(ctx, spotlights, offX, offY) {
+            if (!spotlights || spotlights.length === 0) return;
+            
+            ctx.save();
+            
+            // Calculate the overall opacity based on all spotlights
+            const maxOpacity = Math.max(...spotlights.map(s => s.opacity || 1.0));
+            
+            // Fill entire canvas with dark overlay
+            ctx.fillStyle = `rgba(0, 0, 0, ${0.7 * maxOpacity})`;
+            ctx.fillRect(0, 0, ctx.canvas.width, ctx.canvas.height);
+            
+            // Cut out completely transparent holes for all spotlights
+            ctx.globalCompositeOperation = 'destination-out';
+            ctx.fillStyle = 'rgba(0, 0, 0, 1)'; // Solid black to ensure complete removal
+            for (const spotlight of spotlights) {
+                ctx.beginPath();
+                ctx.arc(spotlight.x - offX, spotlight.y - offY, spotlight.radius, 0, 2 * Math.PI);
+                ctx.fill();
+            }
+            
+            ctx.restore();
+        }
+
+        function renderOverlay() {
+            if (!_overlay || !_overlayCtx) return;
+            _overlayCtx.clearRect(0, 0, _overlay.width, _overlay.height);
+            const offX = containerScrollLeft();
+            const offY = containerScrollTop();
+            // Draw non-spotlight shapes first
+            for (const s of _shapes) {
+                const opacity = s.opacity !== undefined ? s.opacity : 1.0;
+                if (s.type === 'arrow') {
+                    drawArrow(_overlayCtx, s.x1 - offX, s.y1 - offY, s.x2 - offX, s.y2 - offY, s.color || '#f00', s.width || 2, opacity);
+                } else if (s.type === 'pen') {
+                    drawPen(_overlayCtx, s.points, s.color || '#f00', s.width || 2, offX, offY, opacity);
+                }
+            }
+            // Draw current drawing (non-spotlight)
+            if (_drawing) {
+                if (_drawing.type === 'pen') {
+                    drawPen(_overlayCtx, _drawing.points, _drawing.color, _drawing.width, offX, offY);
+                } else if (_drawing.type !== 'spotlight') {
+                    drawArrow(_overlayCtx, _drawing.x1 - offX, _drawing.y1 - offY, _drawing.x2 - offX, _drawing.y2 - offY, _drawing.color, _drawing.width);
+                }
+            }
+            
+            // Collect all spotlights (existing + current drawing + cursor preview)
+            const spotlights = [];
+            
+            // Add existing spotlight shapes
+            for (const s of _shapes) {
+                if (s.type === 'spotlight') {
+                    spotlights.push({
+                        x: s.x,
+                        y: s.y, 
+                        radius: s.radius,
+                        opacity: s.opacity !== undefined ? s.opacity : 1.0
+                    });
+                }
+            }
+            
+            // Add current spotlight being drawn
+            if (_drawing && _drawing.type === 'spotlight') {
+                spotlights.push({
+                    x: _drawing.x,
+                    y: _drawing.y,
+                    radius: _drawing.radius,
+                    opacity: 1.0
+                });
+            }
+            
+            // Add cursor preview spotlight if tool is active
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                if (tool === 'spotlight') {
+                    const thickness = getLineThickness();
+                    const radius = thickness * 20;
+                    const cursorWorldX = _cursorX + containerScrollLeft();
+                    const cursorWorldY = _cursorY + containerScrollTop();
+                    spotlights.push({
+                        x: cursorWorldX,
+                        y: cursorWorldY,
+                        radius: radius,
+                        opacity: 0.8
+                    });
+                }
+            }
+            
+            // Draw all spotlights as a single overlay with multiple holes
+            drawAllSpotlights(_overlayCtx, spotlights, offX, offY);
+            
+            // Draw cursor indicators for non-spotlight tools
+            if (_cursorVisible && !_drawing) {
+                const tool = document.body.dataset.tool;
+                const color = getArrowColor();
+                const thickness = getLineThickness();
+                
+                if (tool !== 'spotlight') {
+                    _overlayCtx.save();
+                    _overlayCtx.fillStyle = color;
+                    _overlayCtx.globalAlpha = 0.7;
+                    
+                    if (tool === 'eraser') {
+                        // Draw eraser indicator
+                        _overlayCtx.strokeStyle = color;
+                        _overlayCtx.lineWidth = 2;
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, 10, 0, 2 * Math.PI);
+                        _overlayCtx.stroke();
+                    } else {
+                        // Draw dot for pen/arrow
+                        _overlayCtx.beginPath();
+                        _overlayCtx.arc(_cursorX, _cursorY, thickness / 2, 0, 2 * Math.PI);
+                        _overlayCtx.fill();
+                    }
+                    
+                    _overlayCtx.restore();
+                }
+            }
+        }
+
+        function setOverlayActive(active) {
+            if (!_overlay) initOverlay();
+            _overlay.style.pointerEvents = active ? 'auto' : 'none';
+            _overlay.style.cursor = active ? 'none' : 'auto';
+            // Re-render to ensure visibility aligns with content
+            renderOverlay();
+        }
+
+        function initOverlay() {
+            if (_overlay) return;
+            updateOverlayModeAndContainer();
+            _overlay = document.createElement('canvas');
+            _overlay.className = 'draw-overlay';
+            _overlayCtx = _overlay.getContext('2d');
+            document.body.appendChild(_overlay);
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+
+            // Events
+            _overlay.addEventListener('mousedown', onPointerDown);
+            _overlay.addEventListener('mousemove', onPointerMove);
+            _overlay.addEventListener('mouseenter', onPointerEnter);
+            _overlay.addEventListener('mouseleave', onPointerLeave);
+            document.addEventListener('mouseup', onPointerUp);
+            _overlay.addEventListener('touchstart', onPointerDown, { passive: false });
+            _overlay.addEventListener('touchmove', onPointerMove, { passive: false });
+            document.addEventListener('touchend', onPointerUp);
+
+            _overlayResizeHandler = () => updateOverlayBounds();
+            window.addEventListener('resize', _overlayResizeHandler);
+
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+            
+            // Start fade animation timer
+            _fadeTimer = setInterval(updateShapesFade, 100); // Update every 100ms for smooth fade
+        }
+
+        function rebindOverlayContainer() {
+            if (!_overlay) return;
+            // Remove old scroll handler
+            if (_overlayScrollHandler) { window.removeEventListener('scroll', _overlayScrollHandler); }
+            updateOverlayModeAndContainer();
+            updateOverlayBounds();
+            loadShapes();
+            renderOverlay();
+            _overlayScrollHandler = () => renderOverlay();
+            window.addEventListener('scroll', _overlayScrollHandler);
+        }
+
+        function teardownOverlay() {
+            if (!_overlay) return;
+            _overlay.removeEventListener('mousedown', onPointerDown);
+            _overlay.removeEventListener('mousemove', onPointerMove);
+            _overlay.removeEventListener('mouseenter', onPointerEnter);
+            _overlay.removeEventListener('mouseleave', onPointerLeave);
+            document.removeEventListener('mouseup', onPointerUp);
+            _overlay.removeEventListener('touchstart', onPointerDown);
+            _overlay.removeEventListener('touchmove', onPointerMove);
+            document.removeEventListener('touchend', onPointerUp);
+            if (_overlayResizeHandler) window.removeEventListener('resize', _overlayResizeHandler);
+            if (_overlayScrollHandler) {
+                if (_overlayContainer === window) {
+                    window.removeEventListener('scroll', _overlayScrollHandler);
+                } else if (_overlayContainer) {
+                    _overlayContainer.removeEventListener('scroll', _overlayScrollHandler);
+                }
+            }
+            if (_fadeTimer) {
+                clearInterval(_fadeTimer);
+                _fadeTimer = null;
+            }
+            if (_overlay.parentNode) _overlay.parentNode.removeChild(_overlay);
+            _overlay = null; _overlayCtx = null; _overlayContainer = null; _overlayResizeHandler = null; _overlayScrollHandler = null; _drawing = null;
+        }
+        
+        function teardownFileExplorer() {
+            const fe = document.querySelector('.file-explorer');
+            if (fe && fe.parentNode) fe.parentNode.removeChild(fe);
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+
+        function runCell(cellId){
+            const btn=document.querySelector('.run-btn[onclick*="'+cellId+'"]');
+            const output=document.getElementById('output-'+cellId);
+            if(btn){btn.textContent='⏳ running...';btn.disabled=true;}
+            if(output){output.classList.add('output-stale');}
+            fetch('/run/'+cellId,{method:'POST'}).then(r=>r.json()).then(data=>{
+                if(output){
+                    output.classList.remove('output-stale');
+                    let html='';
+                    if (data.stdout) {
+                    html += '<div class="cell-stdout"><pre class="stdout-text">'
+                        + escapeHtml(data.stdout)
+                        + '</pre></div>';
+                    }
+
+                    console.log('UV Logs:', data);
+                    if(data.stderr) {
+                        // Split UV logs from regular stderr
+                        const lines = data.stderr.split('\\n');
+                        let uvLogs = [];
+                        let regularLogs = [];
+                        let inUvSection = true;
+
+                        for (const line of lines) {
+                            if (inUvSection) {
+                                uvLogs.push(line);
+                                if (line.startsWith('Installed ')) {
+                                    inUvSection = false;
+                                }
+                            } else {
+                                regularLogs.push(line);
+                            }
+                        }
+                        
+
+                        // If we never found "Installed", treat it all as regular stderr
+                        if (inUvSection) {
+                            html+='<div class="cell-stderr">'+escapeHtml(data.stderr)+'</div>';
+                        } else {
+                            const uvLogsStr = uvLogs.join('\\n');
+                            const regularLogsStr = regularLogs.join('\\n').trim();
+
+                            if (uvLogsStr) {
+                                html+='<div class="uv-install-logs">';
+                                html+='<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>';
+                                html+='<div class="uv-logs-content" style="display: none;">'+escapeHtml(uvLogsStr)+'</div>';
+                                html+='</div>';
+                            }
+                            if (regularLogsStr) {
+                                html+='<div class="cell-stderr">'+escapeHtml(regularLogsStr)+'</div>';
+                            }
+                        }
+                    }
+                    output.innerHTML=html;
+                }
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            }).catch(e=>{
+                console.error('Run failed:',e);
+                if(output){output.classList.remove('output-stale');}
+                if(btn){btn.textContent='▶ run';btn.disabled=false;}
+            });
+        }
+
+        function copyCell(cellId){
+            // Try multiple selectors to find the code element
+            // Pygments generates .highlight > pre with spans, not wrapped in <code>
+            let codeElement = document.querySelector('#code-'+cellId+' .highlight pre');
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' pre');
+            }
+            if (!codeElement) {
+                codeElement = document.querySelector('#code-'+cellId+' code');
+            }
+            if (!codeElement) {
+                // Fallback to the code div itself
+                codeElement = document.getElementById('code-'+cellId);
+            }
+
+            const btn = document.querySelector('.copy-btn[onclick*="'+cellId+'"]');
+
+            if (!codeElement) {
+                console.error('Code element not found for cell:', cellId);
+                return;
+            }
+            if (!btn) {
+                console.error('Copy button not found for cell:', cellId);
+                return;
+            }
+
+            const codeText = codeElement.textContent;
+            
+            if (navigator.clipboard && navigator.clipboard.writeText) {
+                navigator.clipboard.writeText(codeText).then(function() {
+                    console.log('Clipboard copy successful');
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                }).catch(function(err) {
+                    console.warn('Clipboard copy failed:', err);
+                    fallbackCopy();
+                });
+            } else {
+                console.log('Using fallback copy method');
+                fallbackCopy();
+            }
+            
+            function fallbackCopy() {
+                const textarea = document.createElement('textarea');
+                textarea.value = codeText;
+                textarea.style.position = 'absolute';
+                textarea.style.left = '-9999px';
+                document.body.appendChild(textarea);
+                textarea.select();
+                try {
+                    const success = document.execCommand('copy');
+                    console.log('Fallback copy success:', success);
+                    btn.textContent = '✓ Copied!';
+                    btn.classList.add('copied');
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                        btn.classList.remove('copied');
+                    }, 2000);
+                } catch (err) {
+                    console.error('Fallback copy failed:', err);
+                    btn.textContent = 'Copy failed';
+                    setTimeout(function() {
+                        btn.textContent = 'Copy';
+                    }, 2000);
+                }
+                document.body.removeChild(textarea);
+            }
+        }
+
+        // // Live reload functionality (robust SSE handling)
+        // (function(){
+        //     if (!('EventSource' in window)) {
+        //         console.warn('SSE not supported in this browser');
+        //         return;
+        //     }
+        //     let source = new EventSource('/events');
+        //     let isOpen = false;
+        //     source.onopen = function(){ isOpen = true; console.log('SSE connected'); };
+        //     source.onmessage = function(e){
+        //         const msg=(e.data||'').trim(); if(!msg) return;
+        //         console.log('SSE message:', msg);
+        //         if (msg==='reload' || msg==='incremental') { location.reload(); }
+        //         // Ignore 'loading' to avoid premature reload loops
+        //     };
+        //     source.onerror = function(e){
+        //         // Let EventSource auto-reconnect instead of forcing a reload
+        //         if (isOpen) console.warn('SSE error after open, retrying...', e);
+        //     };
+        //     window.addEventListener('beforeunload', function(){ try{source.close();}catch(_){} });
+        // })();
+
+
+        document.addEventListener('DOMContentLoaded', function() {
+            // Capture initial cell states before any modifications
+            captureInitialCellStates();
+            
+            updateThemeIcon();
+            updateUiMenu();
+            updateUiDebug();
+            const widgetsEnabled = (document.documentElement.getAttribute('data-widgets') || 'on') === 'on';
+            if (widgetsEnabled) {
+            initMinimap();
+            initFileExplorer();
+            initTools();
+            initOverlay();
+            initStatusWidget();
+            initializeWidgetVisibility();
+            layoutWidgetsStackedBottomRight();
+            window.addEventListener('resize', layoutWidgetsStackedBottomRight);
+            }
+
+            // Apply deep-link selection if present
+            applyLocationFromUrl();
+            updateStateIndicator();
+            
+            // Apply cell states from URL immediately
+            const url = new URL(window.location.href);
+            const encodedCellStates = url.searchParams.get('cells');
+            if (encodedCellStates) {
+                console.log('Applying cell states from URL...');
+                const cellStates = decodeCellStatesFromUrl(encodedCellStates);
+                
+                // Use requestAnimationFrame to ensure DOM is ready
+                requestAnimationFrame(() => {
+                    applyCellStatesFromUrl(cellStates);
+                    
+                    // Clear initialization flag after cell states are applied
+                    if (typeof _isInitializing !== 'undefined') {
+                        _isInitializing = false;
+                    }
+                });
+            } else {
+                // Clear initialization flag even if no cell states
+                if (typeof _isInitializing !== 'undefined') {
+                    requestAnimationFrame(() => {
+                        _isInitializing = false;
+                    });
+                }
+            }
+
+            // Bind drag selection on line numbers
+            document.addEventListener('mousedown', onLineNumberMouseDown);
+            document.addEventListener('mousemove', onDocMouseMove);
+            document.addEventListener('mouseup', onDocMouseUp);
+
+            // Add ESC key handler to exit tools
+            document.addEventListener('keydown', function(e) {
+                if (e.key === 'Escape' || e.keyCode === 27) {
+                    const currentTool = document.body.dataset.tool;
+                    if (currentTool && currentTool !== 'none') {
+                        // Deactivate the current tool
+                        window.setActiveTool('none');
+                    }
+                    // Also clear any active line selection
+                    clearSelection(true);
+                }
+            });
+        });
+
+        function updateStateIndicator() {
+            try {
+                const el = document.getElementById('status-widget');
+                if (!el) return;
+                const tool = document.body.dataset.tool || 'none';
+                if (tool && tool !== 'none') {
+                    el.textContent = `tool: ${tool} — Esc`;
+                    return;
+                }
+                if (_selection) {
+                    const t = _selection.a === _selection.b ? `L${_selection.a}` : `L${_selection.a}-${_selection.b}`;
+                    el.textContent = `selected: ${t} — Esc`;
+                    return;
+                }
+                el.textContent = 'ready — Esc';
+            } catch (_) {}
+        }
+    </script>
+</head>
+
+
+<body>
+    <div class="controls">
+        <div class="controls-buttons">
+            
+            <a href="index.html" class="back-button">← back</a>
+            
+            <div class="theme-toggle" onclick="toggleTheme()">light</div>
+            <div class="reset-toggle" onclick="resetLayout()">reset</div>
+            <div class="menu-button" onclick="toggleMenu()">
+                menu ▼
+                <div class="menu-dropdown">
+                    <div class="menu-item" onclick="setUiTheme('default')">
+                        <span class="menu-checkbox" id="checkbox-ui-default">☑</span> Theme: default
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('none')">
+                        <span class="menu-checkbox" id="checkbox-ui-none">☐</span> Theme: none
+                    </div>
+                    <div class="menu-item" onclick="setUiTheme('monocolor')">
+                        <span class="menu-checkbox" id="checkbox-ui-monocolor">☐</span> Theme: monocolor
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('tools')">
+                        <span class="menu-checkbox" id="checkbox-tools">☐</span> Tools
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('file-explorer')">
+                        <span class="menu-checkbox" id="checkbox-file-explorer">☐</span> File Explorer
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('minimap')">
+                        <span class="menu-checkbox" id="checkbox-minimap">☐</span> Table of Contents
+                    </div>
+                    <div class="menu-item" onclick="toggleWidget('status')">
+                        <span class="menu-checkbox" id="checkbox-status">☑</span> Status Indicator
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    
+    <div class="system-info">
+        <div class="system-info-header">Generated on:</div>
+        <div class="system-info-content">
+            Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+        </div>
+    </div>
+    
+    <div class="main-content">
+        <h1>OpenAI-style MoE (Mixture of Experts) Benchmarks - Aggregated Results</h1>
+<p>This document combines benchmark results from multiple OpenAI-style MoE implementations.</p>
+<h2>Combined Summary and Visualization</h2>
+<div class="artifact-preview">
+<?xml version='1.0' encoding='utf-8'?>
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:ns4="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="864pt" height="576pt" viewBox="0 0 864 576" version="1.1">
+ <metadata>
+  <rdf:RDF>
+   <ns2:Work>
+    <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+    <dc:date>2025-10-31T20:14:14.575906</dc:date>
+    <dc:format>image/svg+xml</dc:format>
+    <dc:creator>
+     <ns2:Agent>
+      <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
+     </ns2:Agent>
+    </dc:creator>
+   </ns2:Work>
+  </rdf:RDF>
+ </metadata>
+ <defs>
+  <style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
+ </defs>
+ <g id="figure--latency" class="figure">
+  <g id="patch_1">
+   <path d="M 0 576  L 864 576  L 864 0  L 0 0  L 0 576  z " style="fill: none" />
+  </g>
+  <g id="axes--1" class="axes">
+   <g id="patch_2">
+    <path d="M 57.26 468.317269  L 845.766818 468.317269  L 845.766818 26.88  L 57.26 26.88  L 57.26 468.317269  z " style="fill: none" />
+   </g>
+   <g id="matplotlib.axis_1">
+    <g id="xtick_1">
+     <g id="grid-x--1" class="grid grid-x">
+      <path d="M 93.101219 468.317269  L 93.101219 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_1">
+      <defs>
+       <path id="mafb3703e5b" d="M 0 0  L 0 3.5  " style="stroke: #000000; stroke-width: 0.8" />
+      </defs>
+      <g>
+       <use ns4:href="#mafb3703e5b" x="93.101219" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_1">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.90334 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
+     </g>
+    </g>
+    <g id="xtick_2">
+     <g id="grid-x--2" class="grid grid-x">
+      <path d="M 195.504702 468.317269  L 195.504702 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_2">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="195.504702" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_2">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.306823 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
+     </g>
+    </g>
+    <g id="xtick_3">
+     <g id="grid-x--3" class="grid grid-x">
+      <path d="M 297.908185 468.317269  L 297.908185 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_3">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="297.908185" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_3">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.460822 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
+     </g>
+    </g>
+    <g id="xtick_4">
+     <g id="grid-x--4" class="grid grid-x">
+      <path d="M 400.311668 468.317269  L 400.311668 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_4">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="400.311668" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_4">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.864305 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
+     </g>
+    </g>
+    <g id="xtick_5">
+     <g id="grid-x--5" class="grid grid-x">
+      <path d="M 502.71515 468.317269  L 502.71515 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_5">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="502.71515" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_5">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.517271 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
+     </g>
+    </g>
+    <g id="xtick_6">
+     <g id="grid-x--6" class="grid grid-x">
+      <path d="M 605.118633 468.317269  L 605.118633 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_6">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="605.118633" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_6">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.920754 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
+     </g>
+    </g>
+    <g id="xtick_7">
+     <g id="grid-x--7" class="grid grid-x">
+      <path d="M 707.522116 468.317269  L 707.522116 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_7">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="707.522116" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_7">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.074754 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
+     </g>
+    </g>
+    <g id="xtick_8">
+     <g id="grid-x--8" class="grid grid-x">
+      <path d="M 809.925599 468.317269  L 809.925599 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_8">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="809.925599" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_8">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.478237 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
+     </g>
+    </g>
+    <g id="label--x" class="xlabel">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="562.556245" transform="rotate(-0 451.513409 562.556245)">Workload</text>
+    </g>
+   </g>
+   <g id="matplotlib.axis_2">
+    <g id="ytick_1">
+     <g id="grid-y--2" class="grid grid-y">
+      <path d="M 57.26 448.946682  L 845.766818 448.946682  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_9">
+      <defs>
+       <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
+      </defs>
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="448.946682" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_9">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="452.745901" transform="rotate(-0 50.26 452.745901)">0</text>
+     </g>
+    </g>
+    <g id="ytick_2">
+     <g id="grid-y--3" class="grid grid-y">
+      <path d="M 57.26 396.207913  L 845.766818 396.207913  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_10">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="396.207913" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_10">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="400.007132" transform="rotate(-0 50.26 400.007132)">200</text>
+     </g>
+    </g>
+    <g id="ytick_3">
+     <g id="grid-y--4" class="grid grid-y">
+      <path d="M 57.26 343.469145  L 845.766818 343.469145  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_11">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="343.469145" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_11">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="347.268363" transform="rotate(-0 50.26 347.268363)">400</text>
+     </g>
+    </g>
+    <g id="ytick_4">
+     <g id="grid-y--5" class="grid grid-y">
+      <path d="M 57.26 290.730376  L 845.766818 290.730376  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_12">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="290.730376" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_12">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="294.529595" transform="rotate(-0 50.26 294.529595)">600</text>
+     </g>
+    </g>
+    <g id="ytick_5">
+     <g id="grid-y--6" class="grid grid-y">
+      <path d="M 57.26 237.991607  L 845.766818 237.991607  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_13">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="237.991607" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_13">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="241.790826" transform="rotate(-0 50.26 241.790826)">800</text>
+     </g>
+    </g>
+    <g id="ytick_6">
+     <g id="grid-y--7" class="grid grid-y">
+      <path d="M 57.26 185.252839  L 845.766818 185.252839  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_14">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="185.252839" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_14">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="189.052057" transform="rotate(-0 50.26 189.052057)">1000</text>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="grid-y--8" class="grid grid-y">
+      <path d="M 57.26 132.51407  L 845.766818 132.51407  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_15">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="132.51407" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_15">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="136.313289" transform="rotate(-0 50.26 136.313289)">1200</text>
+     </g>
+    </g>
+    <g id="ytick_8">
+     <g id="grid-y--9" class="grid grid-y">
+      <path d="M 57.26 79.775301  L 845.766818 79.775301  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_16">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="79.775301" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_16">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="83.57452" transform="rotate(-0 50.26 83.57452)">1400</text>
+     </g>
+    </g>
+    <g id="ytick_9">
+     <g id="grid-y--10" class="grid grid-y">
+      <path d="M 57.26 27.036533  L 845.766818 27.036533  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_17">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="27.036533" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_17">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="30.835751" transform="rotate(-0 50.26 30.835751)">1600</text>
+     </g>
+    </g>
+    <g id="label--y" class="ylabel">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.730313" y="247.598635" transform="rotate(-90 18.730313 247.598635)">Latency P50 (ms)</text>
+    </g>
+   </g>
+   <g id="series--binned-torch" class="series">
+    <path d="M 93.101219 409.379255  L 195.504702 396.140015  L 297.908185 350.644146  L 400.311668 348.035671  L 502.71515 253.280461  L 605.118633 237.49137  L 707.522116 56.970056  L 809.925599 46.94533  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <defs>
+     <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
+    </defs>
+    <g clip-path="url(#pef1bcf59f7)">
+     <use ns4:href="#md7efaf3aec" x="93.101219" y="409.379255" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="195.504702" y="396.140015" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="297.908185" y="350.644146" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="400.311668" y="348.035671" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="502.71515" y="253.280461" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="605.118633" y="237.49137" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="707.522116" y="56.970056" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="809.925599" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
+    </g>
+   </g>
+   <g id="series--gpt-oss-experts" class="series">
+    <path d="M 93.101219 448.251939  L 195.504702 447.922044  L 297.908185 447.94833  L 400.311668 447.56585  L 502.71515 447.170568  L 605.118633 447.00527  L 707.522116 445.444081  L 809.925599 445.46803  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <defs>
+     <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
+    </defs>
+    <g clip-path="url(#pef1bcf59f7)">
+     <use ns4:href="#m9b8c54d372" x="93.101219" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="195.504702" y="447.922044" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="297.908185" y="447.94833" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="400.311668" y="447.56585" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="502.71515" y="447.170568" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="605.118633" y="447.00527" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="707.522116" y="445.444081" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="809.925599" y="445.46803" style="fill: #ff7f0e; stroke: #ff7f0e" />
+    </g>
+   </g>
+   <g id="patch_3">
+    <path d="M 57.26 468.317269  L 57.26 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_4">
+    <path d="M 845.766818 468.317269  L 845.766818 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_5">
+    <path d="M 57.26 468.317269  L 845.766818 468.317269  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_6">
+    <path d="M 57.26 26.88  L 845.766818 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="text_18">
+    <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="20.88" transform="rotate(-0 451.513409 20.88)">MoE Implementation Latency</text>
+   </g>
+   <g id="legend" class="legend">
+    <g id="patch_7">
+     <path d="M 64.26 64.7925  L 177.05375 64.7925  Q 179.05375 64.7925 179.05375 62.7925  L 179.05375 33.88  Q 179.05375 31.88 177.05375 31.88  L 64.26 31.88  Q 62.26 31.88 62.26 33.88  L 62.26 62.7925  Q 62.26 64.7925 64.26 64.7925  L 64.26 64.7925  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+    </g>
+    <g id="line2d_18">
+     <path d="M 66.26 39.978438  L 76.26 39.978438  L 86.26 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+     <g>
+      <use ns4:href="#md7efaf3aec" x="76.26" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
+     </g>
+    </g>
+    <g id="legend-label--binned-torch" class="legend">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="43.478438" transform="rotate(-0 94.26 43.478438)">binned_torch</text>
+    </g>
+    <g id="line2d_19">
+     <path d="M 66.26 54.934687  L 76.26 54.934687  L 86.26 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+     <g>
+      <use ns4:href="#m9b8c54d372" x="76.26" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     </g>
+    </g>
+    <g id="legend-label--gpt-oss-experts" class="legend">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="58.434687" transform="rotate(-0 94.26 58.434687)">gpt_oss_experts</text>
+    </g>
+   </g>
+  </g>
+ </g>
+ <defs>
+  <clipPath id="pef1bcf59f7">
+   <rect x="57.26" y="26.88" width="788.506818" height="441.437269" />
+  </clipPath>
+ </defs>
+</svg>
+</div>
+
+<div class="cell" id="cell-combine">
+<div class="cell-header">
+<span class="collapse-indicators">
+<span onclick="toggleCode('combine')" style="cursor: pointer;">▶ code</span> 
+<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
+</span> | 
+Cell: combine | 4.26s
+ | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
+<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
+<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
+</div>
+<div id="code-combine" class="cell-code collapsed" data-lines="27">
+<div class="highlight-with-lines">
+<div class="line-numbers" id="lines-combine">
+<a class="line-number" data-cell="combine" data-line="1" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 1, true);">1</a>
+<a class="line-number" data-cell="combine" data-line="2" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 2, true);">2</a>
+<a class="line-number" data-cell="combine" data-line="3" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 3, true);">3</a>
+<a class="line-number" data-cell="combine" data-line="4" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 4, true);">4</a>
+<a class="line-number" data-cell="combine" data-line="5" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 5, true);">5</a>
+<a class="line-number" data-cell="combine" data-line="6" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 6, true);">6</a>
+<a class="line-number" data-cell="combine" data-line="7" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 7, true);">7</a>
+<a class="line-number" data-cell="combine" data-line="8" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 8, true);">8</a>
+<a class="line-number" data-cell="combine" data-line="9" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 9, true);">9</a>
+<a class="line-number" data-cell="combine" data-line="10" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 10, true);">10</a>
+<a class="line-number" data-cell="combine" data-line="11" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 11, true);">11</a>
+<a class="line-number" data-cell="combine" data-line="12" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 12, true);">12</a>
+<a class="line-number" data-cell="combine" data-line="13" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 13, true);">13</a>
+<a class="line-number" data-cell="combine" data-line="14" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 14, true);">14</a>
+<a class="line-number" data-cell="combine" data-line="15" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 15, true);">15</a>
+<a class="line-number" data-cell="combine" data-line="16" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 16, true);">16</a>
+<a class="line-number" data-cell="combine" data-line="17" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 17, true);">17</a>
+<a class="line-number" data-cell="combine" data-line="18" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 18, true);">18</a>
+<a class="line-number" data-cell="combine" data-line="19" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 19, true);">19</a>
+<a class="line-number" data-cell="combine" data-line="20" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 20, true);">20</a>
+<a class="line-number" data-cell="combine" data-line="21" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 21, true);">21</a>
+<a class="line-number" data-cell="combine" data-line="22" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 22, true);">22</a>
+<a class="line-number" data-cell="combine" data-line="23" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 23, true);">23</a>
+<a class="line-number" data-cell="combine" data-line="24" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 24, true);">24</a>
+<a class="line-number" data-cell="combine" data-line="25" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 25, true);">25</a>
+<a class="line-number" data-cell="combine" data-line="26" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 26, true);">26</a>
+<a class="line-number" data-cell="combine" data-line="27" href="#cell-combine" onclick="event.preventDefault(); selectCellLine('combine', 27, true);">27</a>
+</div>
+<div class="code-wrap">
+<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
+<span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
+<span class="c1"># dependencies = [</span>
+<span class="c1">#     &quot;numpy&quot;,</span>
+<span class="c1">#     &quot;torch==2.8.0&quot;,</span>
+<span class="c1">#     &quot;kernels-benchmark-tools&quot;,</span>
+<span class="c1">#     &quot;matplotlib&quot;,</span>
+<span class="c1"># ]</span>
+<span class="c1">#</span>
+<span class="c1"># [tool.uv.sources]</span>
+<span class="c1"># kernels-benchmark-tools = { path = &quot;../../../../../tools&quot;, editable = true }</span>
+<span class="c1"># ///</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">kernels_benchmark_tools.core.visuals</span><span class="w"> </span><span class="kn">import</span> <span class="n">generate_combined_results</span>
+
+<span class="c1"># Map display names to uvnote environment variables</span>
+<span class="n">cache_env_map</span> <span class="o">=</span> <span class="p">{</span>
+    <span class="c1"># &quot;PyTorch OpenAI MoE&quot;: &quot;UVNOTE_FILE_TORCH_OPENAI_MOE_BENCHMARK&quot;,</span>
+    <span class="s2">&quot;Binned PyTorch&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_BINNED_TORCH_BENCHMARK&quot;</span><span class="p">,</span>
+    <span class="s2">&quot;GptOssExperts&quot;</span><span class="p">:</span> <span class="s2">&quot;UVNOTE_FILE_GPT_OSS_MOE_BENCHMARK&quot;</span><span class="p">,</span>
+<span class="p">}</span>
+
+<span class="c1"># Generate combined results with visualization</span>
+<span class="n">generate_combined_results</span><span class="p">(</span>
+    <span class="n">cache_env_map</span><span class="o">=</span><span class="n">cache_env_map</span><span class="p">,</span>
+    <span class="n">output_filename</span><span class="o">=</span><span class="s2">&quot;openai_moe.jsonl&quot;</span><span class="p">,</span>
+    <span class="n">svg_filename</span><span class="o">=</span><span class="s2">&quot;latency.svg&quot;</span>
+<span class="p">)</span>
+</pre></div>
+
+<div class="code-line-highlight" id="line-highlight-combine"></div>
+</div>
+</div>
+</div>
+<div id="output-combine" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ Binned PyTorch                : /__w/kernels-benchmarks/kernels-benchmarks/benches/openai_moe/impls/.uvnote/cache/fd01907ce582015b5dd52e56081cc8e2a21813f73271b422308d60a8ab9391af
+✓ GptOssExperts                 : /__w/kernels-benchmarks/kernels-benchmarks/benches/openai_moe/impls/.uvnote/cache/002e3e7d42f2dbf6d5e5216db57e56aa649bc6ac59ce4131ce80c5849e52482b
+
+  ✓ Found Binned PyTorch
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/openai_moe/impls/.uvnote/cache/fd01907ce582015b5dd52e56081cc8e2a21813f73271b422308d60a8ab9391af/openai_moe.jsonl
+  ✓ Found GptOssExperts
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/openai_moe/impls/.uvnote/cache/002e3e7d42f2dbf6d5e5216db57e56aa649bc6ac59ce4131ce80c5849e52482b/openai_moe.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+binned_torch             cuda_B1_S1024_E2     372.79  True
+binned_torch             cuda_B1_S1024_E4     382.68  True
+binned_torch             cuda_B1_S512_E2      150.05  True
+binned_torch             cuda_B1_S512_E4      200.26  True
+binned_torch             cuda_B4_S1024_E2    1486.48  True
+binned_torch             cuda_B4_S1024_E4    1524.50  True
+binned_torch             cuda_B4_S512_E2      742.02  True
+binned_torch             cuda_B4_S512_E4      801.90  True
+gpt_oss_experts          cuda_B1_S1024_E2       3.79  True
+gpt_oss_experts          cuda_B1_S1024_E4       5.24  True
+gpt_oss_experts          cuda_B1_S512_E2        2.63  True
+gpt_oss_experts          cuda_B1_S512_E4        3.89  True
+gpt_oss_experts          cuda_B4_S1024_E2      13.28  True
+gpt_oss_experts          cuda_B4_S1024_E4      13.19  True
+gpt_oss_experts          cuda_B4_S512_E2        6.74  True
+gpt_oss_experts          cuda_B4_S512_E4        7.36  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 16 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+  ✓ Binned PyTorch
+  ✓ GptOssExperts
+</pre></div>
+<div class="uv-install-logs" id="uv-logs-combine">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 196ms
+</div>
+</div>
+<div class="cell-artifacts">
+<h4>Artifacts:</h4>
+<a href="artifacts/combine/latency.svg" class="artifact" target="_blank">latency.svg</a>
+<div class="artifact-preview">
+<?xml version='1.0' encoding='utf-8'?>
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:ns4="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="864pt" height="576pt" viewBox="0 0 864 576" version="1.1">
+ <metadata>
+  <rdf:RDF>
+   <ns2:Work>
+    <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+    <dc:date>2025-10-31T20:14:14.575906</dc:date>
+    <dc:format>image/svg+xml</dc:format>
+    <dc:creator>
+     <ns2:Agent>
+      <dc:title>Matplotlib v3.10.7, https://matplotlib.org/</dc:title>
+     </ns2:Agent>
+    </dc:creator>
+   </ns2:Work>
+  </rdf:RDF>
+ </metadata>
+ <defs>
+  <style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
+ </defs>
+ <g id="figure--latency" class="figure">
+  <g id="patch_1">
+   <path d="M 0 576  L 864 576  L 864 0  L 0 0  L 0 576  z " style="fill: none" />
+  </g>
+  <g id="axes--1" class="axes">
+   <g id="patch_2">
+    <path d="M 57.26 468.317269  L 845.766818 468.317269  L 845.766818 26.88  L 57.26 26.88  L 57.26 468.317269  z " style="fill: none" />
+   </g>
+   <g id="matplotlib.axis_1">
+    <g id="xtick_1">
+     <g id="grid-x--1" class="grid grid-x">
+      <path d="M 93.101219 468.317269  L 93.101219 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_1">
+      <defs>
+       <path id="mafb3703e5b" d="M 0 0  L 0 3.5  " style="stroke: #000000; stroke-width: 0.8" />
+      </defs>
+      <g>
+       <use ns4:href="#mafb3703e5b" x="93.101219" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_1">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.90334 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
+     </g>
+    </g>
+    <g id="xtick_2">
+     <g id="grid-x--2" class="grid grid-x">
+      <path d="M 195.504702 468.317269  L 195.504702 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_2">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="195.504702" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_2">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.306823 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
+     </g>
+    </g>
+    <g id="xtick_3">
+     <g id="grid-x--3" class="grid grid-x">
+      <path d="M 297.908185 468.317269  L 297.908185 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_3">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="297.908185" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_3">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.460822 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
+     </g>
+    </g>
+    <g id="xtick_4">
+     <g id="grid-x--4" class="grid grid-x">
+      <path d="M 400.311668 468.317269  L 400.311668 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_4">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="400.311668" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_4">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.864305 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
+     </g>
+    </g>
+    <g id="xtick_5">
+     <g id="grid-x--5" class="grid grid-x">
+      <path d="M 502.71515 468.317269  L 502.71515 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_5">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="502.71515" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_5">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.517271 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
+     </g>
+    </g>
+    <g id="xtick_6">
+     <g id="grid-x--6" class="grid grid-x">
+      <path d="M 605.118633 468.317269  L 605.118633 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_6">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="605.118633" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_6">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.920754 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
+     </g>
+    </g>
+    <g id="xtick_7">
+     <g id="grid-x--7" class="grid grid-x">
+      <path d="M 707.522116 468.317269  L 707.522116 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_7">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="707.522116" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_7">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.074754 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
+     </g>
+    </g>
+    <g id="xtick_8">
+     <g id="grid-x--8" class="grid grid-x">
+      <path d="M 809.925599 468.317269  L 809.925599 26.88  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_8">
+      <g>
+       <use ns4:href="#mafb3703e5b" x="809.925599" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_8">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.478237 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
+     </g>
+    </g>
+    <g id="label--x" class="xlabel">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="562.556245" transform="rotate(-0 451.513409 562.556245)">Workload</text>
+    </g>
+   </g>
+   <g id="matplotlib.axis_2">
+    <g id="ytick_1">
+     <g id="grid-y--2" class="grid grid-y">
+      <path d="M 57.26 448.946682  L 845.766818 448.946682  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_9">
+      <defs>
+       <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
+      </defs>
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="448.946682" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_9">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="452.745901" transform="rotate(-0 50.26 452.745901)">0</text>
+     </g>
+    </g>
+    <g id="ytick_2">
+     <g id="grid-y--3" class="grid grid-y">
+      <path d="M 57.26 396.207913  L 845.766818 396.207913  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_10">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="396.207913" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_10">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="400.007132" transform="rotate(-0 50.26 400.007132)">200</text>
+     </g>
+    </g>
+    <g id="ytick_3">
+     <g id="grid-y--4" class="grid grid-y">
+      <path d="M 57.26 343.469145  L 845.766818 343.469145  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_11">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="343.469145" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_11">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="347.268363" transform="rotate(-0 50.26 347.268363)">400</text>
+     </g>
+    </g>
+    <g id="ytick_4">
+     <g id="grid-y--5" class="grid grid-y">
+      <path d="M 57.26 290.730376  L 845.766818 290.730376  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_12">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="290.730376" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_12">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="294.529595" transform="rotate(-0 50.26 294.529595)">600</text>
+     </g>
+    </g>
+    <g id="ytick_5">
+     <g id="grid-y--6" class="grid grid-y">
+      <path d="M 57.26 237.991607  L 845.766818 237.991607  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_13">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="237.991607" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_13">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="241.790826" transform="rotate(-0 50.26 241.790826)">800</text>
+     </g>
+    </g>
+    <g id="ytick_6">
+     <g id="grid-y--7" class="grid grid-y">
+      <path d="M 57.26 185.252839  L 845.766818 185.252839  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_14">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="185.252839" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_14">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="189.052057" transform="rotate(-0 50.26 189.052057)">1000</text>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="grid-y--8" class="grid grid-y">
+      <path d="M 57.26 132.51407  L 845.766818 132.51407  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_15">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="132.51407" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_15">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="136.313289" transform="rotate(-0 50.26 136.313289)">1200</text>
+     </g>
+    </g>
+    <g id="ytick_8">
+     <g id="grid-y--9" class="grid grid-y">
+      <path d="M 57.26 79.775301  L 845.766818 79.775301  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_16">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="79.775301" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_16">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="83.57452" transform="rotate(-0 50.26 83.57452)">1400</text>
+     </g>
+    </g>
+    <g id="ytick_9">
+     <g id="grid-y--10" class="grid grid-y">
+      <path d="M 57.26 27.036533  L 845.766818 27.036533  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+     </g>
+     <g id="line2d_17">
+      <g>
+       <use ns4:href="#m0fca2865ba" x="57.26" y="27.036533" style="stroke: #000000; stroke-width: 0.8" />
+      </g>
+     </g>
+     <g id="text_17">
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="30.835751" transform="rotate(-0 50.26 30.835751)">1600</text>
+     </g>
+    </g>
+    <g id="label--y" class="ylabel">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.730313" y="247.598635" transform="rotate(-90 18.730313 247.598635)">Latency P50 (ms)</text>
+    </g>
+   </g>
+   <g id="series--binned-torch" class="series">
+    <path d="M 93.101219 409.379255  L 195.504702 396.140015  L 297.908185 350.644146  L 400.311668 348.035671  L 502.71515 253.280461  L 605.118633 237.49137  L 707.522116 56.970056  L 809.925599 46.94533  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <defs>
+     <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
+    </defs>
+    <g clip-path="url(#pef1bcf59f7)">
+     <use ns4:href="#md7efaf3aec" x="93.101219" y="409.379255" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="195.504702" y="396.140015" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="297.908185" y="350.644146" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="400.311668" y="348.035671" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="502.71515" y="253.280461" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="605.118633" y="237.49137" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="707.522116" y="56.970056" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="809.925599" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
+    </g>
+   </g>
+   <g id="series--gpt-oss-experts" class="series">
+    <path d="M 93.101219 448.251939  L 195.504702 447.922044  L 297.908185 447.94833  L 400.311668 447.56585  L 502.71515 447.170568  L 605.118633 447.00527  L 707.522116 445.444081  L 809.925599 445.46803  " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <defs>
+     <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
+    </defs>
+    <g clip-path="url(#pef1bcf59f7)">
+     <use ns4:href="#m9b8c54d372" x="93.101219" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="195.504702" y="447.922044" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="297.908185" y="447.94833" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="400.311668" y="447.56585" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="502.71515" y="447.170568" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="605.118633" y="447.00527" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="707.522116" y="445.444081" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="809.925599" y="445.46803" style="fill: #ff7f0e; stroke: #ff7f0e" />
+    </g>
+   </g>
+   <g id="patch_3">
+    <path d="M 57.26 468.317269  L 57.26 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_4">
+    <path d="M 845.766818 468.317269  L 845.766818 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_5">
+    <path d="M 57.26 468.317269  L 845.766818 468.317269  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="patch_6">
+    <path d="M 57.26 26.88  L 845.766818 26.88  " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
+   </g>
+   <g id="text_18">
+    <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="20.88" transform="rotate(-0 451.513409 20.88)">Attention Implementation Latency</text>
+   </g>
+   <g id="legend" class="legend">
+    <g id="patch_7">
+     <path d="M 64.26 64.7925  L 177.05375 64.7925  Q 179.05375 64.7925 179.05375 62.7925  L 179.05375 33.88  Q 179.05375 31.88 177.05375 31.88  L 64.26 31.88  Q 62.26 31.88 62.26 33.88  L 62.26 62.7925  Q 62.26 64.7925 64.26 64.7925  L 64.26 64.7925  z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
+    </g>
+    <g id="line2d_18">
+     <path d="M 66.26 39.978438  L 76.26 39.978438  L 86.26 39.978438  " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+     <g>
+      <use ns4:href="#md7efaf3aec" x="76.26" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
+     </g>
+    </g>
+    <g id="legend-label--binned-torch" class="legend">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="43.478438" transform="rotate(-0 94.26 43.478438)">binned_torch</text>
+    </g>
+    <g id="line2d_19">
+     <path d="M 66.26 54.934687  L 76.26 54.934687  L 86.26 54.934687  " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+     <g>
+      <use ns4:href="#m9b8c54d372" x="76.26" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     </g>
+    </g>
+    <g id="legend-label--gpt-oss-experts" class="legend">
+     <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="58.434687" transform="rotate(-0 94.26 58.434687)">gpt_oss_experts</text>
+    </g>
+   </g>
+  </g>
+ </g>
+ <defs>
+  <clipPath id="pef1bcf59f7">
+   <rect x="57.26" y="26.88" width="788.506818" height="441.437269" />
+  </clipPath>
+ </defs>
+</svg>
+</div>
+</div>
+</div>
+</div>
+    </div>
+    
+</body>
+</html>
\ No newline at end of file
diff --git a/openai_moe/results/index.html b/openai_moe/results/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..f90c14f2561df54bf5684b44db4e9111a3233a73
--- /dev/null
+++ b/openai_moe/results/index.html
@@ -0,0 +1,88 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset='UTF-8'>
+  <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+  <title>Index of /openai_moe/results</title>
+  <style>
+    :root {
+      --bg-primary: #0a0a0a;
+      --bg-secondary: #121212;
+      --bg-tertiary: #181818;
+      --text-primary: #e0e0e0;
+      --text-secondary: #888888;
+      --text-link: #64b5f6;
+      --border-primary: #2a2a2a;
+    }
+    body {
+      font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+      background: var(--bg-primary);
+      color: var(--text-primary);
+      margin: 0;
+      padding: 16px;
+      max-width: 900px;
+      margin: 0 auto;
+    }
+    .controls {
+      display: flex;
+      justify-content: flex-end;
+      margin-bottom: 1rem;
+    }
+    .back-button {
+      background: var(--bg-secondary);
+      border: 1px solid var(--border-primary);
+      padding: 8px 12px;
+      border-radius: 4px;
+      color: var(--text-secondary);
+      cursor: pointer;
+      font-size: 0.9rem;
+      text-decoration: none;
+      display: inline-block;
+    }
+    .back-button:hover {
+      color: var(--text-primary);
+      background: var(--bg-tertiary);
+    }
+    h1 {
+      font-size: 1.5em;
+      margin: 1rem 0;
+      color: var(--text-primary);
+      border-bottom: 1px solid var(--border-primary);
+      padding-bottom: 0.5rem;
+    }
+    ul {
+      list-style-type: none;
+      padding: 0;
+    }
+    li {
+      margin: 0;
+      border-bottom: 1px solid var(--border-primary);
+    }
+    li:last-child {
+      border-bottom: none;
+    }
+    a {
+      display: block;
+      padding: 0.75rem 0.5rem;
+      text-decoration: none;
+      color: var(--text-link);
+      transition: background 0.2s ease;
+    }
+    a:hover {
+      background: var(--bg-secondary);
+    }
+    .dir {
+      font-weight: 500;
+    }
+  </style>
+</head>
+<body>
+  <div class='controls'>
+    <a href='../index.html' class='back-button'>← back</a>
+  </div>
+  <h1>Index of /openai_moe/results</h1>
+  <ul>
+    <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
+  </ul>
+</body>
+</html>
\ No newline at end of file
diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl
index 1c45cd7ef0b4126f1b9f9093527991af5f22312d..2f046365b897b6b0052a6d0c4d2d39bda02f57ee 100644
--- a/rotary/impls/artifacts/benchmark/rotary.jsonl
+++ b/rotary/impls/artifacts/benchmark/rotary.jsonl
@@ -1,24 +1,24 @@
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17100299999128765, "p50": 0.1746739999930469, "p90": 0.1748229999520845, "mean": 0.17718919998515048, "iqr": 0.0008499999921696144, "raw_times": [0.1739729999599149, 0.19147300002941847, 0.17100299999128765, 0.1748229999520845, 0.1746739999930469], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1834729999927731, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2229240000133359, "p50": 0.22586399995816464, "p90": 0.2263739999648351, "mean": 0.2256739999893398, "iqr": 0.0011099999710495467, "raw_times": [0.2229240000133359, 0.22586399995816464, 0.2263739999648351, 0.22526399999378555, 0.22794400001657777], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22923400001673144, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22077400001307979, "p50": 0.2269739999860576, "p90": 0.2270040000098561, "mean": 0.2274739999961639, "iqr": 0.0016900000332498166, "raw_times": [0.22531399997660628, 0.2269739999860576, 0.23730399999521978, 0.22077400001307979, 0.2270040000098561], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23222400000122434, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21741400001928923, "p50": 0.2200139999786188, "p90": 0.22255300001461364, "mean": 0.22600780001766907, "iqr": 0.0031789999752618314, "raw_times": [0.2200139999786188, 0.21741400001928923, 0.22255300001461364, 0.2193740000393518, 0.25068400003647184], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22356400000944632, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22311399999352943, "p50": 0.22467400003733928, "p90": 0.22555399999646397, "mean": 0.22520960002339052, "iqr": 0.0019609999526437605, "raw_times": [0.22555399999646397, 0.22467400003733928, 0.2291130000457997, 0.22311399999352943, 0.2235930000438202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042399999440022, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21832400000221241, "p50": 0.21977400001560454, "p90": 0.22210299999869676, "mean": 0.22257580001223687, "iqr": 0.002768999991076271, "raw_times": [0.21832400000221241, 0.22210299999869676, 0.23334400003705014, 0.2193340000076205, 0.21977400001560454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23037299996531146, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21834399996123466, "p50": 0.22027399995749875, "p90": 0.22137399997745888, "mean": 0.22062599997525467, "iqr": 0.00113999999484804, "raw_times": [0.22290399999747024, 0.22027399995749875, 0.22137399997745888, 0.22023399998261084, 0.21834399996123466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22569399999383677, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22165399997220447, "p50": 0.22276400000009744, "p90": 0.2233839999803422, "mean": 0.22286399999984496, "iqr": 0.0009499999578110874, "raw_times": [0.2224340000225311, 0.22165399997220447, 0.2233839999803422, 0.22276400000009744, 0.2240840000240496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2297839999982898, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22094399997740766, "p50": 0.22392400001081114, "p90": 0.226194999981999, "mean": 0.22451620000083494, "iqr": 0.003770999967400712, "raw_times": [0.22094399997740766, 0.22392400001081114, 0.22909400001935865, 0.226194999981999, 0.22242400001459828], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22784399999409288, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22157400002242866, "p50": 0.22411399999100468, "p90": 0.22636400001374568, "mean": 0.22692980001011165, "iqr": 0.0029010000162088545, "raw_times": [0.22346299999753683, 0.22636400001374568, 0.2391340000258424, 0.22157400002242866, 0.22411399999100468], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22614400000975365, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22298400000408947, "p50": 0.2238440000041919, "p90": 0.225143999955435, "mean": 0.22477019999769254, "iqr": 0.0013799999578623101, "raw_times": [0.22376399999757268, 0.22298400000408947, 0.22811500002717366, 0.225143999955435, 0.2238440000041919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22483399999373432, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22621400000844005, "p50": 0.22858399995584477, "p90": 0.22970399999167057, "mean": 0.22850999999945998, "iqr": 0.003459999959432025, "raw_times": [0.22624400003223855, 0.22858399995584477, 0.22621400000844005, 0.23180400000910595, 0.22970399999167057], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22907400000349298, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22101400003293747, "p50": 0.224504000016168, "p90": 0.22463400000560796, "mean": 0.22994020000624005, "iqr": 0.0010800000040944724, "raw_times": [0.22101400003293747, 0.22463400000560796, 0.2559949999749733, 0.224504000016168, 0.2235540000015135], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2261639999687759, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22005400001035014, "p50": 0.22374300004912584, "p90": 0.22502399997392786, "mean": 0.2236157999959687, "iqr": 0.0018200000226897828, "raw_times": [0.22374300004912584, 0.2260539999952016, 0.22320399995123807, 0.22502399997392786, 0.22005400001035014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2302039999904082, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22128399996290682, "p50": 0.22392400001081114, "p90": 0.22476399999504793, "mean": 0.22437599998283986, "iqr": 0.0009500000146545062, "raw_times": [0.22381399998039342, 0.22809399996503998, 0.22476399999504793, 0.22392400001081114, 0.22128399996290682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23023399995736327, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21981399999049245, "p50": 0.22491400000035355, "p90": 0.2271139999834304, "mean": 0.2256657999851086, "iqr": 0.0057999999967250915, "raw_times": [0.2213139999867053, 0.2351729999645613, 0.21981399999049245, 0.22491400000035355, 0.2271139999834304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22896399997307526, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2212340000369295, "p50": 0.22300300003053053, "p90": 0.22370400000681911, "mean": 0.2228398000170273, "iqr": 0.0018700000055105193, "raw_times": [0.22300300003053053, 0.22370400000681911, 0.22442400000954876, 0.2212340000369295, 0.2218340000013086], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24375499998541272, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21925400000100126, "p50": 0.22213400001191985, "p90": 0.2221839999947406, "mean": 0.22427599999446102, "iqr": 0.0003700000092976552, "raw_times": [0.22181399998544293, 0.23599399997920045, 0.21925400000100126, 0.22213400001191985, 0.2221839999947406], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.255094999999983, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:24Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22207399996432287, "p50": 0.22611399998595516, "p90": 0.22614400000975365, "mean": 0.22540399997978966, "iqr": 0.0004800000397153781, "raw_times": [0.22611399998595516, 0.22207399996432287, 0.22702399996887834, 0.22614400000975365, 0.22566399997003828], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24224399999184243, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21955500000103711, "p50": 0.2238039999724606, "p90": 0.22417399998175824, "mean": 0.229038399993442, "iqr": 0.0011799999697359453, "raw_times": [0.2238039999724606, 0.22417399998175824, 0.25466499999993175, 0.2229940000120223, 0.21955500000103711], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23440400002527895, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2215840000303615, "p50": 0.22219400000267342, "p90": 0.22536399995942702, "mean": 0.22351999999727923, "iqr": 0.0037099999303791265, "raw_times": [0.2215840000303615, 0.22219400000267342, 0.22680399996488632, 0.22536399995942702, 0.2216540000290479], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22727300000724426, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22125399999595174, "p50": 0.22540399999115834, "p90": 0.23793399998339737, "mean": 0.23205199998983517, "iqr": 0.012690000005477486, "raw_times": [0.2504240000007485, 0.22524399997791988, 0.22125399999595174, 0.23793399998339737, 0.22540399999115834], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22909400001935865, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2288640000074338, "p50": 0.2294729999903211, "p90": 0.22959400001809627, "mean": 0.2293698000016775, "iqr": 0.0005100000066704524, "raw_times": [0.22959400001809627, 0.2294729999903211, 0.22983399998111054, 0.2288640000074338, 0.22908400001142581], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22990399997979694, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-30T15:53:25Z", "run": "2249f950b26b464fbf028ad8f9536606", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347010000240516, "p50": 0.6367309999859572, "p90": 0.6407210000247687, "mean": 0.6405370000038602, "iqr": 0.004560000036235579, "raw_times": [0.6543709999959901, 0.6347010000240516, 0.6407210000247687, 0.6367309999859572, 0.6361609999885331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6394609999915701, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0773009999761598, "p50": 0.07878200000277502, "p90": 0.07927199999357981, "mean": 0.08125379999910365, "iqr": 0.0008899999670575198, "raw_times": [0.07927199999357981, 0.07878200000277502, 0.09253199999648132, 0.07838200002652229, 0.0773009999761598], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08711200001698671, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09208300002683245, "p50": 0.09279300002162927, "p90": 0.09387199997945572, "mean": 0.09325840001110919, "iqr": 0.0014699999724143709, "raw_times": [0.09208300002683245, 0.09240200000704135, 0.09387199997945572, 0.09514200002058715, 0.09279300002162927], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0956929999915701, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09122299996988659, "p50": 0.09174199999506527, "p90": 0.09311200000183817, "mean": 0.09907239998483419, "iqr": 0.0014700000292577897, "raw_times": [0.12764299998480055, 0.09311200000183817, 0.09174199999506527, 0.09164199997258038, 0.09122299996988659], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09512200000472149, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09014300002263553, "p50": 0.09057199997641874, "p90": 0.09099299995796173, "mean": 0.09084659998279676, "iqr": 0.0004309999894758221, "raw_times": [0.09014300002263553, 0.0905619999684859, 0.09099299995796173, 0.09057199997641874, 0.0919629999884819], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09270300000707721, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09128200002805897, "p50": 0.09358200003362072, "p90": 0.09361200000057579, "mean": 0.0932360000092558, "iqr": 5.9999990753567545e-05, "raw_times": [0.09358200003362072, 0.09415199997420132, 0.09361200000057579, 0.09128200002805897, 0.09355200000982222], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09915200001842095, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09190200000830373, "p50": 0.09338199998865093, "p90": 0.09447299999010283, "mean": 0.09361019999687414, "iqr": 0.0011509999922054703, "raw_times": [0.09190200000830373, 0.09338199998865093, 0.09497199999941586, 0.09332199999789736, 0.09447299999010283], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09578299994927875, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09281299998065151, "p50": 0.09429199997157411, "p90": 0.09554199999683988, "mean": 0.0945923999893239, "iqr": 0.0018490000002202578, "raw_times": [0.09662200000093435, 0.09281299998065151, 0.09369299999661962, 0.09554199999683988, 0.09429199997157411], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09821199995485586, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09230199998455646, "p50": 0.09352199998602373, "p90": 0.09397200000194061, "mean": 0.09366439998075293, "iqr": 0.00047900005029077874, "raw_times": [0.09230199998455646, 0.09503299997959402, 0.09349299995164984, 0.09352199998602373, 0.09397200000194061], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09802200003150574, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0920319999977437, "p50": 0.0931920000084574, "p90": 0.09354200000188939, "mean": 0.09315399998968132, "iqr": 0.0011200000358257967, "raw_times": [0.0931920000084574, 0.0920319999977437, 0.0924219999660636, 0.09458199997425254, 0.09354200000188939], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09710300003007433, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09380299997019392, "p50": 0.0960819999704654, "p90": 0.10296200002812839, "mean": 0.0988743999982944, "iqr": 0.00756899999032612, "raw_times": [0.10613199998488199, 0.09539300003780227, 0.09380299997019392, 0.10296200002812839, 0.0960819999704654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0969220000115456, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09422199997288772, "p50": 0.0958319999995183, "p90": 0.09810200003812497, "mean": 0.09699820000150794, "iqr": 0.0028600000518963498, "raw_times": [0.0958319999995183, 0.09524199998622862, 0.09422199997288772, 0.09810200003812497, 0.10159300001078009], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09825199998658718, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.26106699999672855, "p50": 0.2625369999691429, "p90": 0.266995999993469, "mean": 0.2640226000039547, "iqr": 0.0046789999714746955, "raw_times": [0.2625369999691429, 0.2671960000384388, 0.2623170000219943, 0.266995999993469, 0.26106699999672855], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26397600004202104, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09289299998727074, "p50": 0.09412200000724624, "p90": 0.0941720000469104, "mean": 0.09422220001624737, "iqr": 0.0009999999974752427, "raw_times": [0.09412200000724624, 0.09317200004943516, 0.0941720000469104, 0.09289299998727074, 0.09675199999037432], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09680300001946307, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09332299998732196, "p50": 0.09457300001258773, "p90": 0.10429200000316996, "mean": 0.09875060001149905, "iqr": 0.009959999999864522, "raw_times": [0.10723300005111014, 0.09332299998732196, 0.09457300001258773, 0.10429200000316996, 0.09433200000330544], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.0961519999691518, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0932629999965684, "p50": 0.09406200001649268, "p90": 0.09426200000461904, "mean": 0.09393640000325831, "iqr": 0.0008400000410802022, "raw_times": [0.0932629999965684, 0.09406200001649268, 0.09342199996353884, 0.09467300003507262, 0.09426200000461904], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09624299997312846, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09137300003203563, "p50": 0.09416199998213415, "p90": 0.09422200002973113, "mean": 0.09578819999660482, "iqr": 0.00042000004896181053, "raw_times": [0.09422200002973113, 0.09137300003203563, 0.09416199998213415, 0.10538199995835384, 0.09380199998076932], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09458300002052056, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09282199999915974, "p50": 0.09416199998213415, "p90": 0.09431199998743978, "mean": 0.09398199999850476, "iqr": 0.00039999997625272954, "raw_times": [0.09416199998213415, 0.0947020000126031, 0.09431199998743978, 0.09282199999915974, 0.09391200001118705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09807300000375108, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09392299995170106, "p50": 0.09451299996499074, "p90": 0.09455299999672206, "mean": 0.09461079997663546, "iqr": 0.00017000002117129043, "raw_times": [0.09392299995170106, 0.09568199999421267, 0.09451299996499074, 0.09438299997555077, 0.09455299999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09652299996787406, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0935829999662019, "p50": 0.0949919999584381, "p90": 0.09520300000076531, "mean": 0.09494659997244526, "iqr": 0.0008610000463704637, "raw_times": [0.09434199995439485, 0.09661299998242612, 0.0935829999662019, 0.09520300000076531, 0.0949919999584381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09693200001947844, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09247200000572775, "p50": 0.09415199997420132, "p90": 0.09440299999141644, "mean": 0.09443839999221382, "iqr": 0.001340999972399004, "raw_times": [0.09440299999141644, 0.09415199997420132, 0.09810299997070615, 0.09306200001901743, 0.09247200000572775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09771300000238625, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09252199998854849, "p50": 0.093122000009771, "p90": 0.09490200000072946, "mean": 0.09375020000561562, "iqr": 0.0023690000148235413, "raw_times": [0.093122000009771, 0.09252199998854849, 0.09490200000072946, 0.09567200004312326, 0.09253299998590592], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09691200000361277, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09136299996725938, "p50": 0.09425199999668621, "p90": 0.0960129999612036, "mean": 0.0991567999903964, "iqr": 0.002309999956651154, "raw_times": [0.09370300000455245, 0.09136299996725938, 0.12045300002228032, 0.09425199999668621, 0.0960129999612036], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09500200002321435, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.26061699998081167, "p50": 0.26556599999594255, "p90": 0.26563699998405355, "mean": 0.2649027999950704, "iqr": 0.001249999968422344, "raw_times": [0.26061699998081167, 0.26830699999891294, 0.2643870000156312, 0.26563699998405355, 0.26556599999594255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26123600002847525, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-31T20:01:22Z", "run": "eaf2d47fcdc24840a68457c07da24ae9", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8460020000029544, "p50": 0.8488419999821417, "p90": 0.8517510000274342, "mean": 0.8514335999961986, "iqr": 0.004409000041505351, "raw_times": [0.8632309999825338, 0.8488419999821417, 0.8517510000274342, 0.8473419999859289, 0.8460020000029544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8540019999827564, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
diff --git a/rotary/impls/cells/benchmark.py b/rotary/impls/cells/benchmark.py
index 94d42ad7f4a476fdf06a84f3b75776b234ecb848..7f6fcb6c184c6611acf24218eb91d13889eaa08e 100644
--- a/rotary/impls/cells/benchmark.py
+++ b/rotary/impls/cells/benchmark.py
@@ -4,6 +4,7 @@
 #     "numpy",
 #     "torch==2.8.0",
 #     "kernels-benchmark-tools",
+#     "kernels",
 # ]
 #
 # [tool.uv.sources]
@@ -12,46 +13,36 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
 
+# Load the rotary kernel
+rotary = get_kernel("kernels-community/rotary")
 
-def apply_rotary_torch(x1, x2, cos, sin, conj=False):
-    """Reference rotary implementation."""
-    if not conj:
-        out1 = x1 * cos - x2 * sin
-        out2 = x1 * sin + x2 * cos
-    else:
-        out1 = x1 * cos + x2 * sin
-        out2 = -x1 * sin + x2 * cos
-    return out1, out2
 
-
-def torch_rotary(query, key, cos, sin, conj=False):
+def hf_kernels_rotary(query, key, cos, sin, conj=False):
     rotary_dim = cos.shape[-1]
 
-    # Clone inputs to avoid modifying them
+    # Clone to avoid modifying inputs
     q_out = query.clone()
     k_out = key.clone()
 
     # Apply rotation to query
     q1 = q_out[..., :rotary_dim]
     q2 = q_out[..., rotary_dim : 2 * rotary_dim]
-    q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
-    q_out[..., :rotary_dim] = q_out_1
-    q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
+    rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
 
     # Apply rotation to key
     k1 = k_out[..., :rotary_dim]
     k2 = k_out[..., rotary_dim : 2 * rotary_dim]
-    k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
-    k_out[..., :rotary_dim] = k_out_1
-    k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
+    rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
 
     return q_out, k_out
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.ROTARY,
-    impl_name="torch_eager",
-    impl_tags={"family": "pytorch", "backend": "eager"},
-    impl_func=torch_rotary,
+    impl_name="hf_kernels_rotary",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_rotary,
+    dtype="float32",
 )
\ No newline at end of file
diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html
index 727c47ed5f26fae785282a5942ab8c0a053fcc96..330944cc91943bb2b53930714f4af06cb0ca72b7 100644
--- a/rotary/impls/hf_kernels_rotary.html
+++ b/rotary/impls/hf_kernels_rotary.html
@@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.21s
+Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/rotary/impls/hf_kernels_rotary.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/rotary/impls/hf_kernels_rotary.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/rotary" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">
@@ -4122,7 +4123,7 @@ Cell: nv | 0.21s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:23 2025       
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:00 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   32C    P0            101W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,11 +4156,12 @@ Cell: nv | 0.21s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 8.39s
+Cell: benchmark | 4.67s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/rotary/impls/hf_kernels_rotary.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/rotary/impls/hf_kernels_rotary.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/rotary" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="48">
 <div class="code-wrap">
@@ -4225,23 +4227,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     437.951us      1890.33%     437.951us     437.951us             1  
-                                      hf_kernels_rotary        12.22%     256.435us        99.67%       2.092ms       2.092ms       0.000us         0.00%      24.448us      24.448us             1  
-                          _rotary_dba7d1e::apply_rotary         2.70%      56.773us         5.22%     109.533us      18.255us      16.128us        69.61%      16.128us       2.688us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.128us        69.61%      16.128us       2.688us             6  
-                                            aten::clone         2.06%      43.312us        79.20%       1.663ms     277.110us       0.000us         0.00%       8.320us       1.387us             6  
-                                            aten::copy_         2.16%      45.349us        74.16%       1.557ms     259.469us       7.040us        30.39%       8.320us       1.387us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.040us        30.39%       7.040us       1.173us             6  
-                                Activity Buffer Request        68.35%       1.435ms        68.35%       1.435ms       1.435ms       1.280us         5.52%       1.280us       1.280us             1  
-                                    aten::empty_strided         2.98%      62.532us         2.98%      62.532us      10.422us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         3.65%      76.672us         3.65%      76.672us      12.779us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.33%      48.990us         3.04%      63.719us       5.310us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.70%      14.729us         0.70%      14.729us       1.227us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.51%      52.760us         2.51%      52.760us       8.793us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.33%       6.840us         0.33%       6.840us       6.840us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     426.303us      1837.51%     426.303us     426.303us             1  
+                                      hf_kernels_rotary        12.40%     260.056us        99.66%       2.090ms       2.090ms       0.000us         0.00%      24.480us      24.480us             1  
+                          _rotary_dba7d1e::apply_rotary         2.75%      57.674us         5.07%     106.315us      17.719us      16.128us        69.52%      16.128us       2.688us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.128us        69.52%      16.128us       2.688us             6  
+                                            aten::clone         2.13%      44.582us        79.34%       1.664ms     277.309us       0.000us         0.00%       8.352us       1.392us             6  
+                                            aten::copy_         1.84%      38.562us        74.44%       1.561ms     260.165us       7.072us        30.48%       8.352us       1.392us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.072us        30.48%       7.072us       1.179us             6  
+                                Activity Buffer Request        69.01%       1.447ms        69.01%       1.447ms       1.447ms       1.280us         5.52%       1.280us       1.280us             1  
+                                    aten::empty_strided         2.78%      58.281us         2.78%      58.281us       9.713us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.58%      75.121us         3.58%      75.121us      12.520us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.14%      44.780us         2.85%      59.790us       4.983us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.72%      15.010us         0.72%      15.010us       1.251us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.32%      48.641us         2.32%      48.641us       8.107us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.34%       7.100us         0.34%       7.100us       7.100us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.099ms
-Self CUDA time total: 23.168us
+Self CPU time total: 2.097ms
+Self CUDA time total: 23.200us
 
 
 
@@ -4251,23 +4253,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.903us      1449.48%     347.903us     347.903us             1  
-                                      hf_kernels_rotary         8.54%     161.773us        99.74%       1.890ms       1.890ms       0.000us         0.00%      25.314us      25.314us             1  
-                          _rotary_dba7d1e::apply_rotary         2.18%      41.260us         4.61%      87.431us      14.572us      16.194us        67.47%      16.194us       2.699us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.194us        67.47%      16.194us       2.699us             6  
-                                            aten::clone         1.21%      22.941us        84.30%       1.597ms     266.206us       0.000us         0.00%       9.120us       1.520us             6  
-                                            aten::copy_         2.05%      38.809us        81.33%       1.541ms     256.844us       7.808us        32.53%       9.120us       1.520us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        32.53%       7.808us       1.301us             6  
-                                Activity Buffer Request        76.43%       1.448ms        76.43%       1.448ms       1.448ms       1.312us         5.47%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.75%      33.230us         1.75%      33.230us       5.538us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.85%      54.092us         2.85%      54.092us       9.015us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.79%      33.972us         2.29%      43.382us       3.615us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.50%       9.410us         0.50%       9.410us       0.784us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.44%      46.171us         2.44%      46.171us       7.695us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.26%       4.990us         0.26%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     340.796us      1422.00%     340.796us     340.796us             1  
+                                      hf_kernels_rotary         9.48%     182.026us        99.73%       1.916ms       1.916ms       0.000us         0.00%      25.278us      25.278us             1  
+                          _rotary_dba7d1e::apply_rotary         2.22%      42.701us         4.40%      84.531us      14.088us      16.159us        67.42%      16.159us       2.693us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.159us        67.42%      16.159us       2.693us             6  
+                                            aten::clone         1.41%      27.120us        83.58%       1.605ms     267.570us       0.000us         0.00%       9.119us       1.520us             6  
+                                            aten::copy_         2.02%      38.773us        80.45%       1.545ms     257.555us       7.807us        32.58%       9.119us       1.520us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us        32.58%       7.807us       1.301us             6  
+                                Activity Buffer Request        75.56%       1.451ms        75.56%       1.451ms       1.451ms       1.312us         5.47%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.72%      32.970us         1.72%      32.970us       5.495us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.88%      55.291us         2.88%      55.291us       9.215us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.76%      33.749us         2.27%      43.642us       3.637us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.52%       9.893us         0.52%       9.893us       0.824us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.18%      41.830us         2.18%      41.830us       6.972us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.161us         0.27%       5.161us       5.161us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.895ms
-Self CUDA time total: 24.002us
+Self CPU time total: 1.921ms
+Self CUDA time total: 23.966us
 
 
 
@@ -4277,23 +4279,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.799us      1421.56%     344.799us     344.799us             1  
-                                      hf_kernels_rotary         8.36%     157.652us        99.72%       1.880ms       1.880ms       0.000us         0.00%      25.535us      25.535us             1  
-                          _rotary_dba7d1e::apply_rotary         2.20%      41.393us         4.58%      86.433us      14.405us      16.479us        67.94%      16.479us       2.747us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.479us        67.94%      16.479us       2.747us             6  
-                                            aten::clone         1.19%      22.449us        84.54%       1.594ms     265.688us       0.000us         0.00%       9.056us       1.509us             6  
-                                            aten::copy_         1.98%      37.391us        81.51%       1.537ms     256.168us       7.776us        32.06%       9.056us       1.509us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        32.06%       7.776us       1.296us             6  
-                                Activity Buffer Request        76.55%       1.443ms        76.55%       1.443ms       1.443ms       1.280us         5.28%       1.280us       1.280us             1  
-                                    aten::empty_strided         1.84%      34.673us         1.84%      34.673us       5.779us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.98%      56.200us         2.98%      56.200us       9.367us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.75%      32.991us         2.23%      42.120us       3.510us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.48%       9.129us         0.48%       9.129us       0.761us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.39%      45.040us         2.39%      45.040us       7.507us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.28%       5.250us         0.28%       5.250us       5.250us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.421us      1391.81%     339.421us     339.421us             1  
+                                      hf_kernels_rotary         9.18%     172.926us        99.76%       1.879ms       1.879ms       0.000us         0.00%      25.699us      25.699us             1  
+                          _rotary_dba7d1e::apply_rotary         2.20%      41.409us         4.51%      85.000us      14.167us      16.481us        67.58%      16.481us       2.747us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.481us        67.58%      16.481us       2.747us             6  
+                                            aten::clone         1.46%      27.581us        83.73%       1.577ms     262.862us       0.000us         0.00%       9.218us       1.536us             6  
+                                            aten::copy_         1.97%      37.091us        80.45%       1.515ms     252.563us       7.906us        32.42%       9.218us       1.536us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.906us        32.42%       7.906us       1.318us             6  
+                                Activity Buffer Request        75.71%       1.426ms        75.71%       1.426ms       1.426ms       1.312us         5.38%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.82%      34.210us         1.82%      34.210us       5.702us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.77%      52.231us         2.77%      52.231us       8.705us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.80%      33.892us         2.33%      43.952us       3.663us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.53%      10.060us         0.53%      10.060us       0.838us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.31%      43.591us         2.31%      43.591us       7.265us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.550us         0.24%       4.550us       4.550us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.886ms
-Self CUDA time total: 24.255us
+Self CPU time total: 1.884ms
+Self CUDA time total: 24.387us
 
 
 
@@ -4303,23 +4305,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.221us      1225.16%     344.221us     344.221us             1  
-                                      hf_kernels_rotary         7.87%     162.633us        99.75%       2.060ms       2.060ms       0.000us         0.00%      29.824us      29.824us             1  
-                          _rotary_dba7d1e::apply_rotary         1.96%      40.432us         4.15%      85.752us      14.292us      17.728us        63.10%      17.728us       2.955us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.728us        63.10%      17.728us       2.955us             6  
-                                            aten::clone         1.05%      21.772us        85.59%       1.768ms     294.674us       0.000us         0.00%      12.096us       2.016us             6  
-                                            aten::copy_         1.75%      36.131us        82.94%       1.713ms     285.533us      10.368us        36.90%      12.096us       2.016us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.368us        36.90%      10.368us       1.728us             6  
-                                Activity Buffer Request        69.12%       1.428ms        69.12%       1.428ms       1.428ms       1.728us         6.15%       1.728us       1.728us             1  
-                                    aten::empty_strided         1.60%      33.071us         1.60%      33.071us       5.512us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.07%     249.233us        12.07%     249.233us      41.539us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.63%      33.600us         2.13%      43.960us       3.663us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.50%      10.360us         0.50%      10.360us       0.863us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.19%      45.320us         2.19%      45.320us       7.553us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       5.220us         0.25%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     353.466us      1252.36%     353.466us     353.466us             1  
+                                      hf_kernels_rotary         8.35%     176.747us        99.76%       2.111ms       2.111ms       0.000us         0.00%      30.048us      30.048us             1  
+                          _rotary_dba7d1e::apply_rotary         2.17%      45.850us         4.21%      89.000us      14.833us      17.664us        62.59%      17.664us       2.944us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.664us        62.59%      17.664us       2.944us             6  
+                                            aten::clone         1.36%      28.714us        85.13%       1.802ms     300.274us       0.000us         0.00%      12.384us       2.064us             6  
+                                            aten::copy_         1.83%      38.751us        82.20%       1.740ms     289.944us      10.560us        37.41%      12.384us       2.064us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        37.41%      10.560us       1.760us             6  
+                                Activity Buffer Request        67.60%       1.431ms        67.60%       1.431ms       1.431ms       1.824us         6.46%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.57%      33.269us         1.57%      33.269us       5.545us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        12.77%     270.306us        12.77%     270.306us      45.051us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.59%      33.568us         2.07%      43.911us       3.659us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%      10.343us         0.49%      10.343us       0.862us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.04%      43.150us         2.04%      43.150us       7.192us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       5.130us         0.24%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.066ms
-Self CUDA time total: 28.096us
+Self CPU time total: 2.116ms
+Self CUDA time total: 28.224us
 
 
 
@@ -4329,23 +4331,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     345.758us      1419.83%     345.758us     345.758us             1  
-                                      hf_kernels_rotary         7.72%     159.843us        99.76%       2.064ms       2.064ms       0.000us         0.00%      25.664us      25.664us             1  
-                          _rotary_dba7d1e::apply_rotary         1.98%      40.892us         4.09%      84.633us      14.106us      16.544us        67.94%      16.544us       2.757us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.544us        67.94%      16.544us       2.757us             6  
-                                            aten::clone         1.14%      23.531us        85.80%       1.775ms     295.882us       0.000us         0.00%       9.120us       1.520us             6  
-                                            aten::copy_         1.76%      36.431us        83.03%       1.718ms     286.337us       7.808us        32.06%       9.120us       1.520us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        32.06%       7.808us       1.301us             6  
-                                Activity Buffer Request        69.77%       1.444ms        69.77%       1.444ms       1.444ms       1.312us         5.39%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.63%      33.740us         1.63%      33.740us       5.623us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.50%     237.923us        11.50%     237.923us      39.654us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.68%      34.750us         2.15%      44.540us       3.712us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.47%       9.790us         0.47%       9.790us       0.816us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.11%      43.741us         2.11%      43.741us       7.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.890us         0.24%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     351.740us      1444.46%     351.740us     351.740us             1  
+                                      hf_kernels_rotary         8.68%     176.155us        99.77%       2.024ms       2.024ms       0.000us         0.00%      25.663us      25.663us             1  
+                          _rotary_dba7d1e::apply_rotary         2.27%      46.099us         4.32%      87.680us      14.613us      16.479us        67.67%      16.479us       2.747us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.479us        67.67%      16.479us       2.747us             6  
+                                            aten::clone         1.42%      28.832us        84.62%       1.717ms     286.091us       0.000us         0.00%       9.184us       1.531us             6  
+                                            aten::copy_         1.86%      37.831us        81.49%       1.653ms     275.519us       7.872us        32.33%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        32.33%       7.872us       1.312us             6  
+                                Activity Buffer Request        70.03%       1.420ms        70.03%       1.420ms       1.420ms       1.312us         5.39%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.71%      34.601us         1.71%      34.601us       5.767us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.60%     194.784us         9.60%     194.784us      32.464us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.63%      33.102us         2.14%      43.512us       3.626us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.51%      10.410us         0.51%      10.410us       0.867us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.05%      41.581us         2.05%      41.581us       6.930us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.660us         0.23%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.069ms
-Self CUDA time total: 24.352us
+Self CPU time total: 2.029ms
+Self CUDA time total: 24.351us
 
 
 
@@ -4355,23 +4357,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     375.259us      1340.31%     375.259us     375.259us             1  
-                                      hf_kernels_rotary         7.92%     165.422us        99.76%       2.085ms       2.085ms       0.000us         0.00%      29.790us      29.790us             1  
-                          _rotary_dba7d1e::apply_rotary         2.01%      42.019us         4.24%      88.630us      14.772us      17.566us        62.74%      17.566us       2.928us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.566us        62.74%      17.566us       2.928us             6  
-                                            aten::clone         1.13%      23.560us        85.51%       1.787ms     297.810us       0.000us         0.00%      12.224us       2.037us             6  
-                                            aten::copy_         1.86%      38.872us        82.84%       1.731ms     288.508us      10.432us        37.26%      12.224us       2.037us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us        37.26%      10.432us       1.739us             6  
-                                Activity Buffer Request        68.75%       1.437ms        68.75%       1.437ms       1.437ms       1.792us         6.40%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.54%      32.252us         1.54%      32.252us       5.375us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        12.23%     255.474us        12.23%     255.474us      42.579us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.66%      34.672us         2.10%      43.902us       3.658us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       9.230us         0.44%       9.230us       0.769us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.23%      46.611us         2.23%      46.611us       7.769us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.930us         0.24%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.111us      1238.38%     349.111us     349.111us             1  
+                                      hf_kernels_rotary        23.24%     192.013us        99.32%     820.571us     820.571us       0.000us         0.00%      30.015us      30.015us             1  
+                          _rotary_dba7d1e::apply_rotary         5.42%      44.795us        10.63%      87.866us      14.644us      17.632us        62.54%      17.632us       2.939us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.632us        62.54%      17.632us       2.939us             6  
+                                            aten::clone         2.69%      22.223us        60.09%     496.442us      82.740us       0.000us         0.00%      12.383us       2.064us             6  
+                                            aten::copy_         4.60%      38.000us        53.48%     441.890us      73.648us      10.559us        37.46%      12.383us       2.064us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.559us        37.46%      10.559us       1.760us             6  
+                                Activity Buffer Request        26.48%     218.816us        26.48%     218.816us     218.816us       1.824us         6.47%       1.824us       1.824us             1  
+                                    aten::empty_strided         3.91%      32.329us         3.91%      32.329us       5.388us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.40%     185.074us        22.40%     185.074us      30.846us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.04%      33.410us         5.36%      44.250us       3.688us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.31%      10.840us         1.31%      10.840us       0.903us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.21%      43.071us         5.21%      43.071us       7.178us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.68%       5.641us         0.68%       5.641us       5.641us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.090ms
-Self CUDA time total: 27.998us
+Self CPU time total: 826.212us
+Self CUDA time total: 28.191us
 
 
 
@@ -4381,23 +4383,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.557us       858.83%     346.557us     346.557us             1  
-                                      hf_kernels_rotary         7.80%     160.642us        99.76%       2.055ms       2.055ms       0.000us         0.00%      43.200us      43.200us             1  
-                          _rotary_dba7d1e::apply_rotary         2.00%      41.122us         4.23%      87.123us      14.521us      23.424us        58.05%      23.424us       3.904us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.424us        58.05%      23.424us       3.904us             6  
-                                            aten::clone         1.11%      22.900us        85.69%       1.765ms     294.130us       0.000us         0.00%      19.776us       3.296us             6  
-                                            aten::copy_         1.80%      37.091us        82.95%       1.708ms     284.737us      16.928us        41.95%      19.776us       3.296us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us        41.95%      16.928us       2.821us             6  
-                                Activity Buffer Request        70.02%       1.442ms        70.02%       1.442ms       1.442ms       2.848us         7.06%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.62%      33.460us         1.62%      33.460us       5.577us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.13%     229.194us        11.13%     229.194us      38.199us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.60%      33.049us         2.04%      42.051us       3.504us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       9.002us         0.44%       9.002us       0.750us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.23%      46.001us         2.23%      46.001us       7.667us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.950us         0.24%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.984us       852.93%     344.984us     344.984us             1  
+                                      hf_kernels_rotary        22.02%     168.975us        99.39%     762.759us     762.759us       0.000us         0.00%      43.263us      43.263us             1  
+                          _rotary_dba7d1e::apply_rotary         5.75%      44.162us        11.18%      85.802us      14.300us      23.456us        57.99%      23.456us       3.909us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.456us        57.99%      23.456us       3.909us             6  
+                                            aten::clone         2.91%      22.350us        60.45%     463.932us      77.322us       0.000us         0.00%      19.807us       3.301us             6  
+                                            aten::copy_         4.98%      38.249us        53.45%     410.170us      68.362us      16.991us        42.01%      19.807us       3.301us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.991us        42.01%      16.991us       2.832us             6  
+                                Activity Buffer Request        24.55%     188.395us        24.55%     188.395us     188.395us       2.816us         6.96%       2.816us       2.816us             1  
+                                    aten::empty_strided         4.09%      31.412us         4.09%      31.412us       5.235us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.91%     183.526us        23.91%     183.526us      30.588us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.40%      33.790us         5.74%      44.050us       3.671us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.34%      10.260us         1.34%      10.260us       0.855us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.43%      41.640us         5.43%      41.640us       6.940us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.61%       4.661us         0.61%       4.661us       4.661us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.060ms
-Self CUDA time total: 40.352us
+Self CPU time total: 767.420us
+Self CUDA time total: 40.447us
 
 
 
@@ -4407,23 +4409,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.374us       446.91%     349.374us     349.374us             1  
-                                      hf_kernels_rotary         8.00%     163.391us        99.76%       2.039ms       2.039ms       0.000us         0.00%      90.720us      90.720us             1  
-                                            aten::clone         1.09%      22.181us        85.39%       1.745ms     290.833us       0.000us         0.00%      52.224us       8.704us             6  
-                                            aten::copy_         1.85%      37.761us        82.69%       1.690ms     281.650us      39.680us        50.76%      52.224us       8.704us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      39.680us        50.76%      39.680us       6.613us             6  
-                          _rotary_dba7d1e::apply_rotary         2.10%      42.834us         4.25%      86.883us      14.481us      38.496us        49.24%      38.496us       6.416us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      38.496us        49.24%      38.496us       6.416us             6  
-                                Activity Buffer Request        69.78%       1.426ms        69.78%       1.426ms       1.426ms      12.544us        16.05%      12.544us      12.544us             1  
-                                    aten::empty_strided         1.61%      32.920us         1.61%      32.920us       5.487us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.06%     226.094us        11.06%     226.094us      37.682us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.62%      33.171us         2.12%      43.331us       3.611us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.50%      10.160us         0.50%      10.160us       0.847us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.16%      44.049us         2.16%      44.049us       7.341us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.951us         0.24%       4.951us       4.951us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.453us       442.64%     347.453us     347.453us             1  
+                                      hf_kernels_rotary        20.37%     160.826us        99.39%     784.751us     784.751us       0.000us         0.00%      91.040us      91.040us             1  
+                                            aten::clone         2.83%      22.340us        62.44%     492.983us      82.164us       0.000us         0.00%      52.865us       8.811us             6  
+                                            aten::copy_         4.65%      36.740us        55.30%     436.663us      72.777us      40.321us        51.37%      52.865us       8.811us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      40.321us        51.37%      40.321us       6.720us             6  
+                          _rotary_dba7d1e::apply_rotary         5.74%      45.350us        11.00%      86.891us      14.482us      38.175us        48.63%      38.175us       6.362us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      38.175us        48.63%      38.175us       6.362us             6  
+                                Activity Buffer Request        27.86%     219.946us        27.86%     219.946us     219.946us      12.544us        15.98%      12.544us      12.544us             1  
+                                    aten::empty_strided         4.30%      33.980us         4.30%      33.980us       5.663us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.79%     179.977us        22.79%     179.977us      29.996us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.35%      34.361us         5.58%      44.051us       3.671us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.23%       9.690us         1.23%       9.690us       0.808us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.26%      41.541us         5.26%      41.541us       6.924us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.61%       4.830us         0.61%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.044ms
-Self CUDA time total: 78.176us
+Self CPU time total: 789.581us
+Self CUDA time total: 78.496us
 
 
 
@@ -4433,23 +4435,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     333.879us       824.19%     333.879us     333.879us             1  
-                                      hf_kernels_rotary        18.73%     154.483us        99.41%     820.134us     820.134us       0.000us         0.00%      43.327us      43.327us             1  
-                          _rotary_dba7d1e::apply_rotary         4.89%      40.361us        10.02%      82.702us      13.784us      23.422us        57.82%      23.422us       3.904us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.422us        57.82%      23.422us       3.904us             6  
-                                            aten::clone         2.46%      20.259us        65.56%     540.868us      90.145us       0.000us         0.00%      19.905us       3.317us             6  
-                                            aten::copy_         4.70%      38.811us        59.16%     488.099us      81.350us      17.088us        42.18%      19.905us       3.317us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.088us        42.18%      17.088us       2.848us             6  
-                                Activity Buffer Request        27.39%     225.944us        27.39%     225.944us     225.944us       2.817us         6.95%       2.817us       2.817us             1  
-                                    aten::empty_strided         3.94%      32.510us         3.94%      32.510us       5.418us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        27.07%     223.344us        27.07%     223.344us      37.224us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.93%      32.394us         5.10%      42.081us       3.507us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.17%       9.687us         1.17%       9.687us       0.807us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.13%      42.341us         5.13%      42.341us       7.057us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.59%       4.860us         0.59%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.324us       858.06%     347.324us     347.324us             1  
+                                      hf_kernels_rotary         8.65%     173.958us        99.77%       2.007ms       2.007ms       0.000us         0.00%      43.325us      43.325us             1  
+                          _rotary_dba7d1e::apply_rotary         2.18%      43.910us         4.21%      84.770us      14.128us      23.423us        57.87%      23.423us       3.904us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      23.423us        57.87%      23.423us       3.904us             6  
+                                            aten::clone         1.35%      27.211us        84.83%       1.706ms     284.405us       0.000us         0.00%      19.902us       3.317us             6  
+                                            aten::copy_         1.92%      38.681us        81.76%       1.645ms     274.138us      17.055us        42.13%      19.902us       3.317us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.055us        42.13%      17.055us       2.842us             6  
+                                Activity Buffer Request        70.68%       1.422ms        70.68%       1.422ms       1.422ms       2.847us         7.03%       2.847us       2.847us             1  
+                                    aten::empty_strided         1.71%      34.392us         1.71%      34.392us       5.732us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.16%     184.363us         9.16%     184.363us      30.727us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.62%      32.593us         2.08%      41.861us       3.488us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.46%       9.268us         0.46%       9.268us       0.772us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.03%      40.860us         2.03%      40.860us       6.810us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.670us         0.23%       4.670us       4.670us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 824.994us
-Self CUDA time total: 40.510us
+Self CPU time total: 2.012ms
+Self CUDA time total: 40.478us
 
 
 
@@ -4459,23 +4461,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.778us       450.33%     338.778us     338.778us             1  
-                                      hf_kernels_rotary        18.40%     151.937us        99.39%     820.824us     820.824us       0.000us         0.00%      85.723us      85.723us             1  
-                                            aten::clone         2.47%      20.430us        65.45%     540.538us      90.090us       0.000us         0.00%      47.293us       7.882us             6  
-                                            aten::copy_         4.41%      36.400us        59.08%     487.928us      81.321us      36.798us        48.92%      47.293us       7.882us             6  
-                          _rotary_dba7d1e::apply_rotary         4.89%      40.390us        10.51%      86.760us      14.460us      38.430us        51.08%      38.430us       6.405us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      38.430us        51.08%      38.430us       6.405us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      36.798us        48.92%      36.798us       6.133us             6  
-                                Activity Buffer Request        27.74%     229.134us        27.74%     229.134us     229.134us      10.495us        13.95%      10.495us      10.495us             1  
-                                    aten::empty_strided         3.90%      32.180us         3.90%      32.180us       5.363us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.93%     222.394us        26.93%     222.394us      37.066us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.90%      32.180us         5.04%      41.589us       3.466us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.14%       9.409us         1.14%       9.409us       0.784us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.61%      46.370us         5.61%      46.370us       7.728us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.61%       5.040us         0.61%       5.040us       5.040us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.785us       476.45%     361.785us     361.785us             1  
+                                      hf_kernels_rotary         8.64%     176.662us        99.77%       2.040ms       2.040ms       0.000us         0.00%      86.685us      86.685us             1  
+                                            aten::clone         1.40%      28.682us        84.64%       1.731ms     288.486us       0.000us         0.00%      47.871us       7.979us             6  
+                                            aten::copy_         1.80%      36.737us        81.55%       1.668ms     277.962us      37.119us        48.88%      47.871us       7.979us             6  
+                          _rotary_dba7d1e::apply_rotary         2.24%      45.910us         4.34%      88.820us      14.803us      38.814us        51.12%      38.814us       6.469us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      38.814us        51.12%      38.814us       6.469us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      37.119us        48.88%      37.119us       6.187us             6  
+                                Activity Buffer Request        70.82%       1.448ms        70.82%       1.448ms       1.448ms      10.752us        14.16%      10.752us      10.752us             1  
+                                    aten::empty_strided         1.69%      34.462us         1.69%      34.462us       5.744us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.93%     182.677us         8.93%     182.677us      30.446us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.66%      33.994us         2.15%      43.925us       3.660us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%       9.931us         0.49%       9.931us       0.828us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.10%      42.910us         2.10%      42.910us       7.152us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.670us         0.23%       4.670us       4.670us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 825.864us
-Self CUDA time total: 75.228us
+Self CPU time total: 2.045ms
+Self CUDA time total: 75.933us
 
 
 
@@ -4485,23 +4487,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.815us       244.98%     338.815us     338.815us             1  
-                                      hf_kernels_rotary        17.96%     152.299us        99.45%     843.474us     843.474us       0.000us         0.00%     161.823us     161.823us             1  
-                                            aten::clone         2.40%      20.339us        66.32%     562.460us      93.743us       0.000us         0.00%     102.176us      17.029us             6  
-                                            aten::copy_         4.27%      36.251us        60.21%     510.629us      85.105us      78.656us        56.87%     102.176us      17.029us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.656us        56.87%      78.656us      13.109us             6  
-                          _rotary_dba7d1e::apply_rotary         4.86%      41.202us        10.23%      86.763us      14.460us      59.647us        43.13%      59.647us       9.941us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      59.647us        43.13%      59.647us       9.941us             6  
-                                Activity Buffer Request        30.37%     257.584us        30.37%     257.584us     257.584us      23.520us        17.01%      23.520us      23.520us             1  
-                                    aten::empty_strided         3.71%      31.492us         3.71%      31.492us       5.249us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.56%     216.794us        25.56%     216.794us      36.132us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.89%      32.951us         4.95%      41.952us       3.496us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.06%       9.001us         1.06%       9.001us       0.750us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.37%      45.561us         5.37%      45.561us       7.594us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.55%       4.640us         0.55%       4.640us       4.640us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     373.629us       268.97%     373.629us     373.629us             1  
+                                      hf_kernels_rotary         8.95%     179.578us        99.78%       2.002ms       2.002ms       0.000us         0.00%     162.750us     162.750us             1  
+                                            aten::clone         1.48%      29.597us        83.94%       1.684ms     280.680us       0.000us         0.00%     102.944us      17.157us             6  
+                                            aten::copy_         1.82%      36.553us        80.73%       1.620ms     269.962us      79.104us        56.95%     102.944us      17.157us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      79.104us        56.95%      79.104us      13.184us             6  
+                          _rotary_dba7d1e::apply_rotary         2.30%      46.131us         4.57%      91.713us      15.285us      59.806us        43.05%      59.806us       9.968us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      59.806us        43.05%      59.806us       9.968us             6  
+                                Activity Buffer Request        69.91%       1.403ms        69.91%       1.403ms       1.403ms      23.840us        17.16%      23.840us      23.840us             1  
+                                    aten::empty_strided         1.73%      34.712us         1.73%      34.712us       5.785us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.00%     180.563us         9.00%     180.563us      30.094us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.75%      35.198us         2.31%      46.409us       3.867us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.56%      11.211us         0.56%      11.211us       0.934us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.27%      45.582us         2.27%      45.582us       7.597us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       4.510us         0.22%       4.510us       4.510us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 848.114us
-Self CUDA time total: 138.303us
+Self CPU time total: 2.006ms
+Self CUDA time total: 138.910us
 
 
 
@@ -4511,23 +4513,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary        12.84%     152.812us        71.89%     855.575us     855.575us       0.000us         0.00%     769.625us     769.625us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     710.234us       101.16%     710.234us     710.234us             1  
-                                            aten::clone         1.76%      21.001us        48.07%     572.021us      95.337us       0.000us         0.00%     572.987us      95.498us             6  
-                                            aten::copy_         3.15%      37.471us        43.65%     519.450us      86.575us     505.436us        71.99%     572.987us      95.498us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     505.436us        71.99%     505.436us      84.239us             6  
-                          _rotary_dba7d1e::apply_rotary         3.42%      40.722us         7.33%      87.262us      14.544us     196.638us        28.01%     196.638us      32.773us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     196.638us        28.01%     196.638us      32.773us             6  
-                                Activity Buffer Request        21.90%     260.665us        21.90%     260.665us     260.665us      67.551us         9.62%      67.551us      67.551us             1  
-                                    aten::empty_strided         2.65%      31.570us         2.65%      31.570us       5.262us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        18.60%     221.314us        18.60%     221.314us      36.886us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.82%      33.601us         3.65%      43.480us       3.623us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.83%       9.879us         0.83%       9.879us       0.823us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         3.91%      46.540us         3.91%      46.540us       7.757us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        28.11%     334.485us        28.11%     334.485us     334.485us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         7.56%     177.196us        86.68%       2.032ms       2.032ms       0.000us         0.00%     778.402us     778.402us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     717.248us       101.07%     717.248us     717.248us             1  
+                                            aten::clone         1.23%      28.772us        72.98%       1.711ms     285.141us       0.000us         0.00%     578.626us      96.438us             6  
+                                            aten::copy_         1.64%      38.341us        70.23%       1.646ms     274.415us     509.889us        71.85%     578.626us      96.438us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     509.889us        71.85%     509.889us      84.982us             6  
+                          _rotary_dba7d1e::apply_rotary         2.34%      54.801us         4.25%      99.591us      16.598us     199.776us        28.15%     199.776us      33.296us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     199.776us        28.15%     199.776us      33.296us             6  
+                                Activity Buffer Request        60.86%       1.427ms        60.86%       1.427ms       1.427ms      68.737us         9.69%      68.737us      68.737us             1  
+                                    aten::empty_strided         1.52%      35.581us         1.52%      35.581us       5.930us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.74%     181.435us         7.74%     181.435us      30.239us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.41%      33.151us         1.89%      44.330us       3.694us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%      11.179us         0.48%      11.179us       0.932us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.91%      44.790us         1.91%      44.790us       7.465us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        13.32%     312.348us        13.32%     312.348us     312.348us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.190ms
-Self CUDA time total: 702.074us
+Self CPU time total: 2.344ms
+Self CUDA time total: 709.665us
 
 
 
@@ -4537,23 +4539,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     340.957us      1280.69%     340.957us     340.957us             1  
-                                      hf_kernels_rotary        17.85%     154.192us        99.45%     858.915us     858.915us       0.000us         0.00%      27.935us      27.935us             1  
-                          _rotary_dba7d1e::apply_rotary         4.82%      41.593us        10.09%      87.173us      14.529us      18.719us        70.31%      18.719us       3.120us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.719us        70.31%      18.719us       3.120us             6  
-                                            aten::clone         2.51%      21.701us        66.67%     575.779us      95.963us       0.000us         0.00%       9.216us       1.536us             6  
-                                            aten::copy_         4.05%      34.978us        60.54%     522.828us      87.138us       7.904us        29.69%       9.216us       1.536us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        29.69%       7.904us       1.317us             6  
-                                Activity Buffer Request        30.68%     265.004us        30.68%     265.004us     265.004us       1.312us         4.93%       1.312us       1.312us             1  
-                                    aten::empty_strided         3.62%      31.250us         3.62%      31.250us       5.208us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        25.80%     222.846us        25.80%     222.846us      37.141us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.77%      32.522us         4.84%      41.771us       3.481us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.07%       9.249us         1.07%       9.249us       0.771us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.28%      45.580us         5.28%      45.580us       7.597us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.55%       4.760us         0.55%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.629us      1313.11%     349.629us     349.629us             1  
+                                      hf_kernels_rotary         8.75%     174.875us        99.76%       1.994ms       1.994ms       0.000us         0.00%      27.938us      27.938us             1  
+                          _rotary_dba7d1e::apply_rotary         2.16%      43.200us         4.40%      87.900us      14.650us      18.754us        70.43%      18.754us       3.126us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.754us        70.43%      18.754us       3.126us             6  
+                                            aten::clone         1.44%      28.720us        84.48%       1.688ms     281.365us       0.000us         0.00%       9.184us       1.531us             6  
+                                            aten::copy_         1.82%      36.432us        81.36%       1.626ms     271.003us       7.872us        29.57%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us        29.57%       7.872us       1.312us             6  
+                                Activity Buffer Request        70.53%       1.410ms        70.53%       1.410ms       1.410ms       1.312us         4.93%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.67%      33.452us         1.67%      33.452us       5.575us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.01%     180.083us         9.01%     180.083us      30.014us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.63%      32.560us         2.14%      42.684us       3.557us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.51%      10.124us         0.51%      10.124us       0.844us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.24%      44.700us         2.24%      44.700us       7.450us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.780us         0.24%       4.780us       4.780us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 863.675us
-Self CUDA time total: 26.623us
+Self CPU time total: 1.998ms
+Self CUDA time total: 26.626us
 
 
 
@@ -4563,23 +4565,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     331.838us      1247.93%     331.838us     331.838us             1  
-                                      hf_kernels_rotary        18.40%     149.763us        99.33%     808.424us     808.424us       0.000us         0.00%      27.871us      27.871us             1  
-                          _rotary_dba7d1e::apply_rotary         5.12%      41.640us        10.68%      86.941us      14.490us      18.879us        71.00%      18.879us       3.147us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.879us        71.00%      18.879us       3.147us             6  
-                                            aten::clone         2.56%      20.830us        65.24%     531.000us      88.500us       0.000us         0.00%       8.992us       1.499us             6  
-                                            aten::copy_         4.49%      36.550us        58.98%     480.009us      80.001us       7.712us        29.00%       8.992us       1.499us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        29.00%       7.712us       1.285us             6  
-                                Activity Buffer Request        28.18%     229.375us        28.18%     229.375us     229.375us       1.280us         4.81%       1.280us       1.280us             1  
-                                    aten::empty_strided         3.71%      30.161us         3.71%      30.161us       5.027us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.30%     214.084us        26.30%     214.084us      35.681us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.92%      31.890us         5.00%      40.720us       3.393us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.08%       8.830us         1.08%       8.830us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.57%      45.301us         5.57%      45.301us       7.550us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.67%       5.440us         0.67%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.698us      1282.22%     344.698us     344.698us             1  
+                                      hf_kernels_rotary        22.61%     152.757us        99.23%     670.538us     670.538us       0.000us         0.00%      28.195us      28.195us             1  
+                          _rotary_dba7d1e::apply_rotary         6.64%      44.870us        12.97%      87.630us      14.605us      19.009us        70.71%      19.009us       3.168us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.009us        70.71%      19.009us       3.168us             6  
+                                            aten::clone         3.38%      22.839us        57.25%     386.869us      64.478us       0.000us         0.00%       9.186us       1.531us             6  
+                                            aten::copy_         5.63%      38.041us        49.11%     331.829us      55.305us       7.874us        29.29%       9.186us       1.531us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.874us        29.29%       7.874us       1.312us             6  
+                                Activity Buffer Request        16.48%     111.363us        16.48%     111.363us     111.363us       1.312us         4.88%       1.312us       1.312us             1  
+                                    aten::empty_strided         4.77%      32.201us         4.77%      32.201us       5.367us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        27.00%     182.425us        27.00%     182.425us      30.404us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.90%      33.085us         6.41%      43.282us       3.607us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.51%      10.197us         1.51%      10.197us       0.850us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         6.33%      42.760us         6.33%      42.760us       7.127us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.77%       5.200us         0.77%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 813.864us
-Self CUDA time total: 26.591us
+Self CPU time total: 675.738us
+Self CUDA time total: 26.883us
 
 
 
@@ -4589,23 +4591,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     353.852us      1157.89%     353.852us     353.852us             1  
-                                      hf_kernels_rotary         7.66%     156.034us        99.77%       2.033ms       2.033ms       0.000us         0.00%      32.320us      32.320us             1  
-                          _rotary_dba7d1e::apply_rotary         2.04%      41.512us         4.26%      86.762us      14.460us      20.159us        65.97%      20.159us       3.360us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.159us        65.97%      20.159us       3.360us             6  
-                                            aten::clone         1.10%      22.431us        85.66%       1.746ms     290.955us       0.000us         0.00%      12.161us       2.027us             6  
-                                            aten::copy_         2.23%      45.431us        82.85%       1.688ms     281.408us      10.401us        34.03%      12.161us       2.027us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.401us        34.03%      10.401us       1.734us             6  
-                                Activity Buffer Request        70.07%       1.428ms        70.07%       1.428ms       1.428ms       1.760us         5.76%       1.760us       1.760us             1  
-                                    aten::empty_strided         1.71%      34.849us         1.71%      34.849us       5.808us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        10.54%     214.913us        10.54%     214.913us      35.819us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.68%      34.241us         2.20%      44.770us       3.731us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.52%      10.529us         0.52%      10.529us       0.877us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.22%      45.250us         2.22%      45.250us       7.542us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.770us         0.23%       4.770us       4.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.004us      1141.75%     350.004us     350.004us             1  
+                                      hf_kernels_rotary        19.05%     154.214us        99.36%     804.261us     804.261us       0.000us         0.00%      32.414us      32.414us             1  
+                          _rotary_dba7d1e::apply_rotary         5.47%      44.240us        10.98%      88.910us      14.818us      20.064us        65.45%      20.064us       3.344us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.064us        65.45%      20.064us       3.344us             6  
+                                            aten::clone         3.02%      24.421us        63.80%     516.433us      86.072us       0.000us         0.00%      12.350us       2.058us             6  
+                                            aten::copy_         4.66%      37.732us        56.69%     458.901us      76.483us      10.591us        34.55%      12.350us       2.058us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.591us        34.55%      10.591us       1.765us             6  
+                                Activity Buffer Request        29.69%     240.306us        29.69%     240.306us     240.306us       1.759us         5.74%       1.759us       1.759us             1  
+                                    aten::empty_strided         4.09%      33.111us         4.09%      33.111us       5.518us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.34%     180.863us        22.34%     180.863us      30.144us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.15%      33.594us         5.52%      44.704us       3.725us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.37%      11.110us         1.37%      11.110us       0.926us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.52%      44.670us         5.52%      44.670us       7.445us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.64%       5.201us         0.64%       5.201us       5.201us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.038ms
-Self CUDA time total: 30.560us
+Self CPU time total: 809.462us
+Self CUDA time total: 30.655us
 
 
 
@@ -4615,23 +4617,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     367.612us       860.51%     367.612us     367.612us             1  
-                                      hf_kernels_rotary         7.69%     158.003us        99.76%       2.050ms       2.050ms       0.000us         0.00%      45.568us      45.568us             1  
-                          _rotary_dba7d1e::apply_rotary         2.04%      41.961us         4.25%      87.391us      14.565us      25.759us        60.30%      25.759us       4.293us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.759us        60.30%      25.759us       4.293us             6  
-                                            aten::clone         1.11%      22.799us        84.82%       1.743ms     290.528us       0.000us         0.00%      19.809us       3.301us             6  
-                                            aten::copy_         1.88%      38.712us        82.12%       1.688ms     281.267us      16.961us        39.70%      19.809us       3.301us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.961us        39.70%      16.961us       2.827us             6  
-                                Activity Buffer Request        69.69%       1.432ms        69.69%       1.432ms       1.432ms       2.848us         6.67%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.59%      32.771us         1.59%      32.771us       5.462us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        10.54%     216.613us        10.54%     216.613us      36.102us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.51%      51.572us         3.00%      61.672us       5.139us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.49%      10.100us         0.49%      10.100us       0.842us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.21%      45.430us         2.21%      45.430us       7.572us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.849us         0.24%       4.849us       4.849us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.355us       822.64%     350.355us     350.355us             1  
+                                      hf_kernels_rotary        19.55%     155.605us        99.35%     790.981us     790.981us       0.000us         0.00%      45.469us      45.469us             1  
+                          _rotary_dba7d1e::apply_rotary         5.55%      44.191us        11.02%      87.731us      14.622us      25.565us        60.03%      25.565us       4.261us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.565us        60.03%      25.565us       4.261us             6  
+                                            aten::clone         2.81%      22.389us        63.13%     502.593us      83.766us       0.000us         0.00%      19.904us       3.317us             6  
+                                            aten::copy_         4.90%      39.043us        56.13%     446.833us      74.472us      17.024us        39.97%      19.904us       3.317us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        39.97%      17.024us       2.837us             6  
+                                Activity Buffer Request        28.37%     225.886us        28.37%     225.886us     225.886us       2.880us         6.76%       2.880us       2.880us             1  
+                                    aten::empty_strided         4.19%      33.371us         4.19%      33.371us       5.562us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.85%     181.904us        22.85%     181.904us      30.317us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.29%      34.142us         5.66%      45.052us       3.754us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.37%      10.910us         1.37%      10.910us       0.909us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.47%      43.540us         5.47%      43.540us       7.257us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.65%       5.140us         0.65%       5.140us       5.140us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.055ms
-Self CUDA time total: 42.720us
+Self CPU time total: 796.121us
+Self CUDA time total: 42.589us
 
 
 
@@ -4641,23 +4643,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.614us      1135.14%     347.614us     347.614us             1  
-                                      hf_kernels_rotary         7.64%     156.781us        99.77%       2.046ms       2.046ms       0.000us         0.00%      32.383us      32.383us             1  
-                          _rotary_dba7d1e::apply_rotary         2.01%      41.122us         4.16%      85.392us      14.232us      20.223us        66.04%      20.223us       3.370us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.223us        66.04%      20.223us       3.370us             6  
-                                            aten::clone         1.11%      22.841us        85.79%       1.759ms     293.223us       0.000us         0.00%      12.160us       2.027us             6  
-                                            aten::copy_         1.81%      37.030us        83.06%       1.703ms     283.910us      10.400us        33.96%      12.160us       2.027us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us        33.96%      10.400us       1.733us             6  
-                                Activity Buffer Request        70.68%       1.449ms        70.68%       1.449ms       1.449ms       1.760us         5.75%       1.760us       1.760us             1  
-                                    aten::empty_strided         1.61%      33.040us         1.61%      33.040us       5.507us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        10.58%     216.984us        10.58%     216.984us      36.164us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.70%      34.784us         2.17%      44.532us       3.711us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.48%       9.748us         0.48%       9.748us       0.812us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.16%      44.270us         2.16%      44.270us       7.378us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.760us         0.23%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     344.951us      1133.59%     344.951us     344.951us             1  
+                                      hf_kernels_rotary        19.05%     153.418us        99.42%     800.680us     800.680us       0.000us         0.00%      32.125us      32.125us             1  
+                          _rotary_dba7d1e::apply_rotary         5.43%      43.718us        10.83%      87.180us      14.530us      20.095us        66.04%      20.095us       3.349us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.095us        66.04%      20.095us       3.349us             6  
+                                            aten::clone         2.75%      22.180us        64.20%     517.012us      86.169us       0.000us         0.00%      12.030us       2.005us             6  
+                                            aten::copy_         4.82%      38.813us        57.22%     460.802us      76.800us      10.335us        33.96%      12.030us       2.005us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.335us        33.96%      10.335us       1.722us             6  
+                                Activity Buffer Request        30.13%     242.666us        30.13%     242.666us     242.666us       1.695us         5.57%       1.695us       1.695us             1  
+                                    aten::empty_strided         4.23%      34.030us         4.23%      34.030us       5.672us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.27%     179.323us        22.27%     179.323us      29.887us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.11%      33.131us         5.35%      43.070us       3.589us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.23%       9.939us         1.23%       9.939us       0.828us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.40%      43.462us         5.40%      43.462us       7.244us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.660us         0.58%       4.660us       4.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.051ms
-Self CUDA time total: 30.623us
+Self CPU time total: 805.340us
+Self CUDA time total: 30.430us
 
 
 
@@ -4667,23 +4669,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     328.444us       771.23%     328.444us     328.444us             1  
-                                      hf_kernels_rotary        18.84%     150.934us        99.38%     796.084us     796.084us       0.000us         0.00%      45.403us      45.403us             1  
-                          _rotary_dba7d1e::apply_rotary         5.06%      40.529us        10.59%      84.811us      14.135us      25.693us        60.33%      25.693us       4.282us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.693us        60.33%      25.693us       4.282us             6  
-                                            aten::clone         2.49%      19.929us        64.90%     519.868us      86.645us       0.000us         0.00%      19.710us       3.285us             6  
-                                            aten::copy_         4.41%      35.321us        58.57%     469.148us      78.191us      16.894us        39.67%      19.710us       3.285us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.894us        39.67%      16.894us       2.816us             6  
-                                Activity Buffer Request        27.59%     221.013us        27.59%     221.013us     221.013us       2.816us         6.61%       2.816us       2.816us             1  
-                                    aten::empty_strided         3.84%      30.791us         3.84%      30.791us       5.132us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.57%     212.814us        26.57%     212.814us      35.469us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.92%      31.361us         5.05%      40.471us       3.373us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.14%       9.110us         1.14%       9.110us       0.759us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.53%      44.282us         5.53%      44.282us       7.380us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.62%       4.951us         0.62%       4.951us       4.951us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     358.905us       840.15%     358.905us     358.905us             1  
+                                      hf_kernels_rotary        15.26%     159.123us        99.55%       1.038ms       1.038ms       0.000us         0.00%      45.598us      45.598us             1  
+                          _rotary_dba7d1e::apply_rotary         4.27%      44.490us         8.42%      87.790us      14.632us      25.600us        59.93%      25.600us       4.267us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.600us        59.93%      25.600us       4.267us             6  
+                                            aten::clone         2.23%      23.211us        71.54%     746.059us     124.343us       0.000us         0.00%      19.998us       3.333us             6  
+                                            aten::copy_         3.70%      38.572us        65.96%     687.817us     114.636us      17.119us        40.07%      19.998us       3.333us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.119us        40.07%      17.119us       2.853us             6  
+                                Activity Buffer Request        44.90%     468.242us        44.90%     468.242us     468.242us       2.879us         6.74%       2.879us       2.879us             1  
+                                    aten::empty_strided         3.36%      35.031us         3.36%      35.031us       5.838us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.36%     181.003us        17.36%     181.003us      30.167us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.32%      34.604us         4.33%      45.135us       3.761us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.01%      10.531us         1.01%      10.531us       0.878us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.15%      43.300us         4.15%      43.300us       7.217us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.45%       4.700us         0.45%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 801.035us
-Self CUDA time total: 42.587us
+Self CPU time total: 1.043ms
+Self CUDA time total: 42.719us
 
 
 
@@ -4693,23 +4695,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     338.910us       380.70%     338.910us     338.910us             1  
-                                      hf_kernels_rotary        14.14%     150.935us        99.54%       1.062ms       1.062ms       0.000us         0.00%     104.734us     104.734us             1  
-                                            aten::clone         2.00%      21.371us        73.53%     784.703us     130.784us       0.000us         0.00%      63.775us      10.629us             6  
-                                            aten::copy_         3.58%      38.219us        68.59%     731.952us     121.992us      48.063us        53.99%      63.775us      10.629us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      48.063us        53.99%      48.063us       8.010us             6  
-                          _rotary_dba7d1e::apply_rotary         3.85%      41.059us         8.05%      85.950us      14.325us      40.959us        46.01%      40.959us       6.826us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      40.959us        46.01%      40.959us       6.826us             6  
-                                Activity Buffer Request        44.86%     478.699us        44.86%     478.699us     478.699us      15.712us        17.65%      15.712us      15.712us             1  
-                                    aten::empty_strided         2.94%      31.380us         2.94%      31.380us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        20.15%     215.034us        20.15%     215.034us      35.839us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.96%      31.591us         3.81%      40.690us       3.391us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.85%       9.099us         0.85%       9.099us       0.758us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.21%      44.891us         4.21%      44.891us       7.482us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.46%       4.900us         0.46%       4.900us       4.900us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     383.638us       432.19%     383.638us     383.638us             1  
+                                      hf_kernels_rotary        19.20%     158.364us        99.38%     819.611us     819.611us       0.000us         0.00%     103.870us     103.870us             1  
+                                            aten::clone         2.74%      22.581us        61.51%     507.313us      84.552us       0.000us         0.00%      63.135us      10.522us             6  
+                                            aten::copy_         4.83%      39.811us        54.76%     451.622us      75.270us      48.031us        54.11%      63.135us      10.522us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      48.031us        54.11%      48.031us       8.005us             6  
+                          _rotary_dba7d1e::apply_rotary         5.49%      45.243us        13.16%     108.504us      18.084us      40.735us        45.89%      40.735us       6.789us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      40.735us        45.89%      40.735us       6.789us             6  
+                                Activity Buffer Request        27.50%     226.825us        27.50%     226.825us     226.825us      15.104us        17.02%      15.104us      15.104us             1  
+                                    aten::empty_strided         4.01%      33.110us         4.01%      33.110us       5.518us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.43%     184.986us        22.43%     184.986us      30.831us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.25%      35.021us         5.51%      45.430us       3.786us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.26%      10.409us         1.26%      10.409us       0.867us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         7.67%      63.261us         7.67%      63.261us      10.543us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.62%       5.141us         0.62%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.067ms
-Self CUDA time total: 89.022us
+Self CPU time total: 824.752us
+Self CUDA time total: 88.766us
 
 
 
@@ -4719,23 +4721,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     336.057us       230.21%     336.057us     336.057us             1  
-                                      hf_kernels_rotary        18.72%     149.775us        99.40%     795.224us     795.224us       0.000us         0.00%     169.949us     169.949us             1  
-                                            aten::clone         2.52%      20.180us        65.04%     520.348us      86.725us       0.000us         0.00%     106.527us      17.755us             6  
-                                            aten::copy_         4.49%      35.890us        58.61%     468.868us      78.145us      82.559us        56.55%     106.527us      17.755us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      82.559us        56.55%      82.559us      13.760us             6  
-                          _rotary_dba7d1e::apply_rotary         5.12%      40.981us        10.49%      83.942us      13.990us      63.422us        43.45%      63.422us      10.570us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.422us        43.45%      63.422us      10.570us             6  
-                                Activity Buffer Request        27.82%     222.544us        27.82%     222.544us     222.544us      23.968us        16.42%      23.968us      23.968us             1  
-                                    aten::empty_strided         3.91%      31.300us         3.91%      31.300us       5.217us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.30%     210.434us        26.30%     210.434us      35.072us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.94%      31.518us         5.14%      41.159us       3.430us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.21%       9.641us         1.21%       9.641us       0.803us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.37%      42.961us         5.37%      42.961us       7.160us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.60%       4.790us         0.60%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     359.259us       247.18%     359.259us     359.259us             1  
+                                      hf_kernels_rotary        19.06%     158.337us        99.39%     825.781us     825.781us       0.000us         0.00%     168.829us     168.829us             1  
+                                            aten::clone         2.83%      23.549us        64.09%     532.493us      88.749us       0.000us         0.00%     105.470us      17.578us             6  
+                                            aten::copy_         4.58%      38.013us        57.29%     475.972us      79.329us      81.982us        56.41%     105.470us      17.578us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.982us        56.41%      81.982us      13.664us             6  
+                          _rotary_dba7d1e::apply_rotary         5.47%      45.451us        10.86%      90.251us      15.042us      63.359us        43.59%      63.359us      10.560us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.359us        43.59%      63.359us      10.560us             6  
+                                Activity Buffer Request        31.29%     259.966us        31.29%     259.966us     259.966us      23.488us        16.16%      23.488us      23.488us             1  
+                                    aten::empty_strided         3.97%      32.972us         3.97%      32.972us       5.495us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.42%     177.993us        21.42%     177.993us      29.665us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.19%      34.839us         5.38%      44.700us       3.725us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.19%       9.861us         1.19%       9.861us       0.822us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.39%      44.800us         5.39%      44.800us       7.467us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.61%       5.100us         0.61%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 800.014us
-Self CUDA time total: 145.981us
+Self CPU time total: 830.881us
+Self CUDA time total: 145.341us
 
 
 
@@ -4745,23 +4747,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     339.836us       451.90%     339.836us     339.836us             1  
-                                      hf_kernels_rotary        18.57%     150.269us        99.38%     804.154us     804.154us       0.000us         0.00%      81.986us      81.986us             1  
-                          _rotary_dba7d1e::apply_rotary         4.99%      40.401us        10.49%      84.862us      14.144us      41.601us        55.32%      41.601us       6.933us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.601us        55.32%      41.601us       6.933us             6  
-                                            aten::clone         2.54%      20.532us        64.81%     524.439us      87.406us       0.000us         0.00%      40.385us       6.731us             6  
-                                            aten::copy_         4.41%      35.708us        58.24%     471.217us      78.536us      33.601us        44.68%      40.385us       6.731us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      33.601us        44.68%      33.601us       5.600us             6  
-                                Activity Buffer Request        27.71%     224.174us        27.71%     224.174us     224.174us       6.784us         9.02%       6.784us       6.784us             1  
-                                    aten::empty_strided         4.04%      32.690us         4.04%      32.690us       5.448us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.12%     211.335us        26.12%     211.335us      35.223us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.32%      34.924us         5.51%      44.584us       3.715us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.19%       9.660us         1.19%       9.660us       0.805us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.49%      44.461us         5.49%      44.461us       7.410us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.62%       4.981us         0.62%       4.981us       4.981us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     385.725us       509.05%     385.725us     385.725us             1  
+                                      hf_kernels_rotary         8.62%     176.456us        99.78%       2.043ms       2.043ms       0.000us         0.00%      82.558us      82.558us             1  
+                          _rotary_dba7d1e::apply_rotary         2.32%      47.603us         4.41%      90.273us      15.045us      41.694us        55.02%      41.694us       6.949us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      41.694us        55.02%      41.694us       6.949us             6  
+                                            aten::clone         1.42%      29.000us        84.54%       1.731ms     288.534us       0.000us         0.00%      40.864us       6.811us             6  
+                                            aten::copy_         1.93%      39.552us        80.14%       1.641ms     273.497us      34.080us        44.98%      40.864us       6.811us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      34.080us        44.98%      34.080us       5.680us             6  
+                                Activity Buffer Request        69.16%       1.416ms        69.16%       1.416ms       1.416ms       6.784us         8.95%       6.784us       6.784us             1  
+                                    aten::empty_strided         2.99%      61.221us         2.99%      61.221us      10.204us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.05%     185.224us         9.05%     185.224us      30.871us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.69%      34.591us         2.21%      45.260us       3.772us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.52%      10.669us         0.52%      10.669us       0.889us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.08%      42.670us         2.08%      42.670us       7.112us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       4.530us         0.22%       4.530us       4.530us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 809.135us
-Self CUDA time total: 75.202us
+Self CPU time total: 2.048ms
+Self CUDA time total: 75.774us
 
 
 
@@ -4771,23 +4773,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     372.859us       256.14%     372.859us     372.859us             1  
-                                      hf_kernels_rotary        18.64%     161.451us        99.43%     861.125us     861.125us       0.000us         0.00%     169.279us     169.279us             1  
-                                            aten::clone         2.47%      21.401us        63.58%     550.631us      91.772us       0.000us         0.00%     105.373us      17.562us             6  
-                                            aten::copy_         4.30%      37.239us        57.31%     496.359us      82.727us      81.662us        56.10%     105.373us      17.562us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.662us        56.10%      81.662us      13.610us             6  
-                          _rotary_dba7d1e::apply_rotary         5.12%      44.341us        12.24%     106.023us      17.671us      63.906us        43.90%      63.906us      10.651us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.906us        43.90%      63.906us      10.651us             6  
-                                Activity Buffer Request        28.62%     247.854us        28.62%     247.854us     247.854us      23.711us        16.29%      23.711us      23.711us             1  
-                                    aten::empty_strided         3.80%      32.871us         3.80%      32.871us       5.479us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        24.39%     211.266us        24.39%     211.266us      35.211us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.88%      33.609us         4.97%      43.020us       3.585us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.09%       9.411us         1.09%       9.411us       0.784us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         7.12%      61.682us         7.12%      61.682us      10.280us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.57%       4.969us         0.57%       4.969us       4.969us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     368.925us       253.94%     368.925us     368.925us             1  
+                                      hf_kernels_rotary         8.62%     177.641us        99.74%       2.055ms       2.055ms       0.000us         0.00%     169.118us     169.118us             1  
+                                            aten::clone         1.42%      29.322us        84.62%       1.743ms     290.539us       0.000us         0.00%     105.470us      17.578us             6  
+                                            aten::copy_         1.92%      39.462us        81.52%       1.679ms     279.897us      81.631us        56.19%     105.470us      17.578us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      81.631us        56.19%      81.631us      13.605us             6  
+                          _rotary_dba7d1e::apply_rotary         2.27%      46.683us         4.40%      90.665us      15.111us      63.648us        43.81%      63.648us      10.608us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      63.648us        43.81%      63.648us      10.608us             6  
+                                Activity Buffer Request        70.79%       1.458ms        70.79%       1.458ms       1.458ms      23.839us        16.41%      23.839us      23.839us             1  
+                                    aten::empty_strided         1.68%      34.530us         1.68%      34.530us       5.755us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.81%     181.504us         8.81%     181.504us      30.251us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.62%      33.289us         2.09%      43.080us       3.590us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.791us         0.48%       9.791us       0.816us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.13%      43.982us         2.13%      43.982us       7.330us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.450us         0.26%       5.450us       5.450us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 866.094us
-Self CUDA time total: 145.568us
+Self CPU time total: 2.060ms
+Self CUDA time total: 145.279us
 
 
 
@@ -4797,23 +4799,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary        13.02%     148.583us        72.32%     825.404us     825.404us       0.000us         0.00%     745.510us     745.510us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     687.015us       101.19%     687.015us     687.015us             1  
-                                            aten::clone         1.76%      20.130us        47.96%     547.368us      91.228us       0.000us         0.00%     556.292us      92.715us             6  
-                                            aten::copy_         3.18%      36.280us        43.26%     493.818us      82.303us     489.699us        72.13%     556.292us      92.715us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     489.699us        72.13%     489.699us      81.617us             6  
-                          _rotary_dba7d1e::apply_rotary         3.57%      40.732us         7.58%      86.552us      14.425us     189.218us        27.87%     189.218us      31.536us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     189.218us        27.87%     189.218us      31.536us             6  
-                                Activity Buffer Request        21.89%     249.905us        21.89%     249.905us     249.905us      66.593us         9.81%      66.593us      66.593us             1  
-                                    aten::empty_strided         2.93%      33.420us         2.93%      33.420us       5.570us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        18.19%     207.633us        18.19%     207.633us      34.606us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.92%      33.351us         3.76%      42.901us       3.575us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.84%       9.550us         0.84%       9.550us       0.796us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.01%      45.820us         4.01%      45.820us       7.637us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        27.68%     315.986us        27.68%     315.986us     315.986us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary        20.72%     223.838us        78.32%     845.992us     845.992us       0.000us         0.00%     747.476us     747.476us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     688.117us       101.15%     688.117us     688.117us             1  
+                                            aten::clone         2.05%      22.091us        45.23%     488.522us      81.420us       0.000us         0.00%     558.423us      93.070us             6  
+                                            aten::copy_         3.67%      39.650us        40.20%     434.190us      72.365us     491.256us        72.21%     558.423us      93.070us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     491.256us        72.21%     491.256us      81.876us             6  
+                          _rotary_dba7d1e::apply_rotary         4.18%      45.161us         8.45%      91.252us      15.209us     189.053us        27.79%     189.053us      31.509us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     189.053us        27.79%     189.053us      31.509us             6  
+                                Activity Buffer Request        19.62%     211.896us        19.62%     211.896us     211.896us      67.167us         9.87%      67.167us      67.167us             1  
+                                    aten::empty_strided         2.98%      32.241us         2.98%      32.241us       5.374us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.91%     182.644us        16.91%     182.644us      30.441us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.05%      32.939us         3.92%      42.380us       3.532us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.87%       9.441us         0.87%       9.441us       0.787us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.27%      46.091us         4.27%      46.091us       7.682us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        21.68%     234.186us        21.68%     234.186us     234.186us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.141ms
-Self CUDA time total: 678.917us
+Self CPU time total: 1.080ms
+Self CUDA time total: 680.309us
 
 
 
@@ -4823,23 +4825,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         5.26%     153.062us        28.60%     832.074us     832.074us       0.000us         0.00%       2.627ms       2.627ms             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.451ms       100.32%       2.451ms       2.451ms             1  
-                                            aten::clone         0.71%      20.751us        18.92%     550.432us      91.739us       0.000us         0.00%       1.403ms     233.752us             6  
-                                            aten::copy_         1.33%      38.628us        17.10%     497.389us      82.898us       1.219ms        49.87%       1.403ms     233.752us             6  
-                          _rotary_dba7d1e::apply_rotary         1.42%      41.449us         2.89%      84.050us      14.008us       1.225ms        50.13%       1.225ms     204.141us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.225ms        50.13%       1.225ms     204.141us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.219ms        49.87%       1.219ms     203.112us             6  
-                                Activity Buffer Request         8.62%     250.725us         8.62%     250.725us     250.725us     183.838us         7.52%     183.838us     183.838us             1  
-                                    aten::empty_strided         1.11%      32.292us         1.11%      32.292us       5.382us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.15%     208.036us         7.15%     208.036us      34.673us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.18%      34.219us         1.53%      44.530us       3.711us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.35%      10.311us         0.35%      10.311us       0.859us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         1.46%      42.601us         1.46%      42.601us       7.100us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        71.40%       2.077ms        71.40%       2.077ms       2.077ms       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         5.41%     154.946us        27.83%     797.061us     797.061us       0.000us         0.00%       2.625ms       2.625ms             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us       2.453ms       100.31%       2.453ms       2.453ms             1  
+                                            aten::clone         0.79%      22.601us        17.83%     510.683us      85.114us       0.000us         0.00%       1.396ms     232.586us             6  
+                                            aten::copy_         1.43%      40.940us        15.89%     455.120us      75.853us       1.216ms        49.74%       1.396ms     232.586us             6  
+                          _rotary_dba7d1e::apply_rotary         1.59%      45.590us         3.06%      87.640us      14.607us       1.229ms        50.26%       1.229ms     204.885us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us       1.229ms        50.26%       1.229ms     204.885us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       1.216ms        49.74%       1.216ms     202.730us             6  
+                                Activity Buffer Request         7.23%     207.076us         7.23%     207.076us     207.076us     179.136us         7.32%     179.136us     179.136us             1  
+                                    aten::empty_strided         1.15%      32.962us         1.15%      32.962us       5.494us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         7.23%     207.104us         7.23%     207.104us      34.517us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.15%      33.011us         1.53%      43.792us       3.649us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.38%      10.781us         0.38%      10.781us       0.898us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         1.47%      42.050us         1.47%      42.050us       7.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        72.17%       2.067ms        72.17%       2.067ms       2.067ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.909ms
-Self CUDA time total: 2.444ms
+Self CPU time total: 2.864ms
+Self CUDA time total: 2.446ms
 
 
 impl                     wl                  p50(ms)  ok
@@ -4848,8 +4850,8 @@ hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  True
 hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.26  True
-hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  True
-hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.10  True
+hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.10  True
 hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  True
@@ -4860,7 +4862,7 @@ hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.85  True
-hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.26  True
+hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.27  True
 hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  True
@@ -4871,14 +4873,12 @@ hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Downloading hf-xet (3.2MiB)
- Downloading hf-xet
-Installed 52 packages in 244ms
+Installed 15 packages in 14ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 5 files:   0%|          | 0/5 [00:00&lt;?, ?it/s]
-Fetching 5 files: 100%|██████████| 5/5 [00:00&lt;00:00, 15.33it/s]
-Fetching 5 files: 100%|██████████| 5/5 [00:00&lt;00:00, 15.31it/s]</div>
+Fetching 5 files:  60%|██████    | 3/5 [00:00&lt;00:00, 28.46it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00&lt;00:00,  9.80it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/rotary.jsonl" class="artifact" target="_blank">rotary.jsonl</a>
diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html
index 27d1faf03e42ae5d5ac730cfd03392b62eb2b62f..d7b34676102680b464b702d7de0525c0d9d460d2 100644
--- a/rotary/impls/torch_rotary.html
+++ b/rotary/impls/torch_rotary.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> | 
-Cell: nv | 0.21s
+Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/rotary/impls/torch_rotary.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/rotary/impls/torch_rotary.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">
@@ -4122,7 +4122,7 @@ Cell: nv | 0.21s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Thu Oct 30 15:52:23 2025       
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:00 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.21s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0             76W /  350W |       0MiB /  46068MiB |     11%      Default |
+| N/A   32C    P0            101W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4153,13 +4153,13 @@ Cell: nv | 0.21s
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span> 
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: benchmark | 3.86s
+Cell: benchmark | 7.58s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/rotary/impls/torch_rotary.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/rotary/impls/torch_rotary.md" target="_blank" class="github-btn">GitHub</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="57">
 <div class="code-wrap">
@@ -4234,27 +4234,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.038ms      1165.07%       1.038ms       1.038ms             1  
-                                            torch_eager        14.25%     384.344us        99.73%       2.691ms       2.691ms       0.000us         0.00%      90.272us      90.272us             1  
-                                              aten::mul         6.11%     164.889us        10.39%     280.433us      11.685us      46.752us        52.50%      46.752us       1.948us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.752us        52.50%      46.752us       1.948us            24  
-                                            aten::copy_         4.15%     111.919us        62.66%       1.690ms      93.917us      29.025us        32.59%      30.240us       1.680us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.306us        25.05%      22.306us       1.859us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.91%      13.280us       1.107us            12  
-                                            aten::clone         1.43%      38.559us        61.06%       1.647ms     274.577us       0.000us         0.00%       7.934us       1.322us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.719us         7.54%       6.719us       1.120us             6  
-                                              aten::sub         1.59%      42.770us         2.55%      68.721us      11.454us       6.688us         7.51%       6.688us       1.115us             6  
-                                              aten::add         1.63%      44.070us         2.49%      67.170us      11.195us       6.592us         7.40%       6.592us       1.099us             6  
-                                Activity Buffer Request        53.52%       1.444ms        53.52%       1.444ms       1.444ms       1.215us         1.36%       1.215us       1.215us             1  
-                                    aten::empty_strided         2.14%      57.723us         2.14%      57.723us       9.620us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.62%      70.572us         2.62%      70.572us      11.762us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.99%      80.691us         3.82%     103.161us       4.298us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.83%      22.470us         0.83%      22.470us       0.936us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.47%     228.526us         8.47%     228.526us       4.761us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.27%       7.361us         0.27%       7.361us       7.361us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.068ms      1195.27%       1.068ms       1.068ms             1  
+                                            torch_eager        14.00%     388.140us        99.71%       2.764ms       2.764ms       0.000us         0.00%      90.528us      90.528us             1  
+                                              aten::mul         6.16%     170.676us        10.43%     289.217us      12.051us      46.911us        52.52%      46.911us       1.955us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.911us        52.52%      46.911us       1.955us            24  
+                                            aten::copy_         4.25%     117.935us        62.65%       1.737ms      96.500us      29.185us        32.68%      30.401us       1.689us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.561us        25.26%      22.561us       1.880us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.216us        14.80%      13.216us       1.101us            12  
+                                            aten::clone         1.62%      44.961us        61.78%       1.713ms     285.451us       0.000us         0.00%       7.840us       1.307us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us         7.42%       6.624us       1.104us             6  
+                                              aten::sub         1.59%      44.071us         2.54%      70.301us      11.717us       6.624us         7.42%       6.624us       1.104us             6  
+                                              aten::add         1.26%      34.801us         2.08%      57.721us       9.620us       6.592us         7.38%       6.592us       1.099us             6  
+                                Activity Buffer Request        53.17%       1.474ms        53.17%       1.474ms       1.474ms       1.216us         1.36%       1.216us       1.216us             1  
+                                    aten::empty_strided         2.35%      65.251us         2.35%      65.251us      10.875us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.98%      82.752us         2.98%      82.752us      13.792us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.05%      84.591us         4.03%     111.694us       4.654us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.98%      27.103us         0.98%      27.103us       1.129us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.29%     229.882us         8.29%     229.882us       4.789us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.29%       8.120us         0.29%       8.120us       8.120us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.698ms
-Self CUDA time total: 89.057us
+Self CPU time total: 2.772ms
+Self CUDA time total: 89.312us
 
 
 
@@ -4264,27 +4264,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     960.319us      1064.55%     960.319us     960.319us             1  
-                                            torch_eager        12.91%     327.841us        99.79%       2.533ms       2.533ms       0.000us         0.00%      91.361us      91.361us             1  
-                                              aten::mul         6.09%     154.573us        10.36%     263.046us      10.960us      47.616us        52.78%      47.616us       1.984us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.616us        52.78%      47.616us       1.984us            24  
-                                            aten::copy_         4.38%     111.264us        65.83%       1.671ms      92.839us      29.313us        32.49%      30.465us       1.692us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.496us        24.94%      22.496us       1.875us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.72%      13.280us       1.107us            12  
-                                            aten::clone         1.07%      27.110us        62.73%       1.592ms     265.408us       0.000us         0.00%       7.969us       1.328us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.817us         7.56%       6.817us       1.136us             6  
-                                              aten::sub         1.66%      42.072us         2.63%      66.652us      11.109us       6.688us         7.41%       6.688us       1.115us             6  
-                                              aten::add         1.28%      32.560us         2.18%      55.291us       9.215us       6.592us         7.31%       6.592us       1.099us             6  
-                                Activity Buffer Request        56.87%       1.444ms        56.87%       1.444ms       1.444ms       1.152us         1.28%       1.152us       1.152us             1  
-                                    aten::empty_strided         1.25%      31.671us         1.25%      31.671us       5.278us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.12%      53.780us         2.12%      53.780us       8.963us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.76%      70.023us         3.57%      90.653us       3.777us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.81%      20.630us         0.81%      20.630us       0.860us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.59%     218.025us         8.59%     218.025us       4.542us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.289us         0.21%       5.289us       5.289us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     960.345us      1063.10%     960.345us     960.345us             1  
+                                            torch_eager        11.94%     304.272us        99.78%       2.543ms       2.543ms       0.000us         0.00%      91.454us      91.454us             1  
+                                              aten::mul         6.19%     157.625us        10.77%     274.398us      11.433us      47.776us        52.89%      47.776us       1.991us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.776us        52.89%      47.776us       1.991us            24  
+                                            aten::copy_         4.14%     105.392us        66.58%       1.697ms      94.258us      29.343us        32.48%      30.463us       1.692us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.559us        24.97%      22.559us       1.880us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.215us        14.63%      13.215us       1.101us            12  
+                                            aten::clone         0.97%      24.733us        63.76%       1.625ms     270.825us       0.000us         0.00%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.51%       6.784us       1.131us             6  
+                                              aten::add         1.23%      31.452us         2.12%      54.072us       9.012us       6.623us         7.33%       6.623us       1.104us             6  
+                                              aten::sub         1.53%      39.032us         2.55%      64.964us      10.827us       6.592us         7.30%       6.592us       1.099us             6  
+                                Activity Buffer Request        57.59%       1.468ms        57.59%       1.468ms       1.468ms       1.120us         1.24%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.31%      33.410us         1.31%      33.410us       5.568us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.43%      61.963us         2.43%      61.963us      10.327us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.76%      70.222us         3.54%      90.271us       3.761us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.79%      20.049us         0.79%      20.049us       0.835us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.91%     226.937us         8.91%     226.937us       4.728us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       5.590us         0.22%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.539ms
-Self CUDA time total: 90.209us
+Self CPU time total: 2.548ms
+Self CUDA time total: 90.334us
 
 
 
@@ -4294,27 +4294,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     950.812us      1013.41%     950.812us     950.812us             1  
-                                            torch_eager        12.58%     319.124us        99.78%       2.531ms       2.531ms       0.000us         0.00%      95.135us      95.135us             1  
-                                              aten::mul         6.09%     154.550us        10.34%     262.291us      10.929us      48.671us        51.88%      48.671us       2.028us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.671us        51.88%      48.671us       2.028us            24  
-                                            aten::copy_         4.10%     104.029us        66.32%       1.682ms      93.470us      30.783us        32.81%      32.095us       1.783us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.943us        24.45%      22.943us       1.912us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.369us        15.32%      14.369us       1.197us            12  
-                                            aten::clone         1.04%      26.300us        63.34%       1.607ms     267.803us       0.000us         0.00%       9.152us       1.525us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us         8.36%       7.840us       1.307us             6  
-                                              aten::sub         1.64%      41.492us         2.64%      66.953us      11.159us       7.199us         7.67%       7.199us       1.200us             6  
-                                              aten::add         1.26%      31.999us         2.14%      54.310us       9.052us       7.170us         7.64%       7.170us       1.195us             6  
-                                Activity Buffer Request        57.64%       1.462ms        57.64%       1.462ms       1.462ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.26%      31.840us         1.26%      31.840us       5.307us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.05%      52.102us         2.05%      52.102us       8.684us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.68%      67.986us         3.47%      87.958us       3.665us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.79%      19.972us         0.79%      19.972us       0.832us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.65%     219.475us         8.65%     219.475us       4.572us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       5.651us         0.22%       5.651us       5.651us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     989.616us      1051.23%     989.616us     989.616us             1  
+                                            torch_eager        12.09%     307.194us        99.76%       2.536ms       2.536ms       0.000us         0.00%      95.450us      95.450us             1  
+                                              aten::mul         6.35%     161.494us        11.09%     281.865us      11.744us      48.958us        52.01%      48.958us       2.040us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.958us        52.01%      48.958us       2.040us            24  
+                                            aten::copy_         4.30%     109.293us        66.10%       1.680ms      93.343us      30.814us        32.73%      32.125us       1.785us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.943us        24.37%      22.943us       1.912us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.367us        15.26%      14.367us       1.197us            12  
+                                            aten::clone         0.97%      24.599us        62.75%       1.595ms     265.823us       0.000us         0.00%       9.182us       1.530us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.871us         8.36%       7.871us       1.312us             6  
+                                              aten::add         1.20%      30.579us         2.08%      52.891us       8.815us       7.199us         7.65%       7.199us       1.200us             6  
+                                              aten::sub         1.49%      37.871us         2.53%      64.231us      10.705us       7.168us         7.61%       7.168us       1.195us             6  
+                                Activity Buffer Request        56.57%       1.438ms        56.57%       1.438ms       1.438ms       1.311us         1.39%       1.311us       1.311us             1  
+                                    aten::empty_strided         1.38%      35.041us         1.38%      35.041us       5.840us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.38%      60.441us         2.38%      60.441us      10.074us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.77%      70.298us         3.53%      89.841us       3.743us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.77%      19.543us         0.77%      19.543us       0.814us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.50%     241.544us         9.50%     241.544us       5.032us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.24%       6.100us         0.24%       6.100us       6.100us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.537ms
-Self CUDA time total: 93.823us
+Self CPU time total: 2.542ms
+Self CUDA time total: 94.139us
 
 
 
@@ -4324,27 +4324,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     952.670us       942.15%     952.670us     952.670us             1  
-                                            torch_eager        11.55%     312.506us        99.79%       2.701ms       2.701ms       0.000us         0.00%     102.429us     102.429us             1  
-                                              aten::mul         5.68%     153.743us         9.71%     262.695us      10.946us      52.765us        52.18%      52.765us       2.199us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.765us        52.18%      52.765us       2.199us            24  
-                                            aten::copy_         3.97%     107.471us        68.61%       1.857ms     103.165us      32.353us        32.00%      33.665us       1.870us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        24.37%      24.641us       2.053us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.999us        15.82%      15.999us       1.333us            12  
-                                            aten::clone         1.01%      27.330us        65.76%       1.780ms     296.625us       0.000us         0.00%       9.024us       1.504us             6  
-                                              aten::add         1.21%      32.850us         2.05%      55.600us       9.267us       8.032us         7.94%       8.032us       1.339us             6  
-                                              aten::sub         1.44%      39.082us         2.35%      63.492us      10.582us       7.967us         7.88%       7.967us       1.328us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.63%       7.712us       1.285us             6  
-                                Activity Buffer Request        52.99%       1.434ms        52.99%       1.434ms       1.434ms       1.312us         1.30%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.20%      32.420us         1.20%      32.420us       5.403us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.27%     250.924us         9.27%     250.924us      41.821us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.56%      69.212us         3.32%      89.782us       3.741us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.76%      20.570us         0.76%      20.570us       0.857us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.14%     220.374us         8.14%     220.374us       4.591us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.791us         0.21%       5.791us       5.791us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     928.327us       916.02%     928.327us     928.327us             1  
+                                            torch_eager        12.51%     290.049us        99.77%       2.313ms       2.313ms       0.000us         0.00%     102.689us     102.689us             1  
+                                              aten::mul         6.36%     147.401us        11.12%     257.946us      10.748us      52.800us        52.10%      52.800us       2.200us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.800us        52.10%      52.800us       2.200us            24  
+                                            aten::copy_         4.62%     107.204us        65.04%       1.508ms      83.777us      32.415us        31.99%      33.760us       1.876us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.511us        24.19%      24.511us       2.043us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.129us        15.92%      16.129us       1.344us            12  
+                                            aten::clone         0.98%      22.822us        61.74%       1.431ms     238.579us       0.000us         0.00%       9.249us       1.542us             6  
+                                              aten::add         1.37%      31.668us         2.34%      54.320us       9.053us       8.096us         7.99%       8.096us       1.349us             6  
+                                              aten::sub         1.57%      36.291us         2.61%      60.431us      10.072us       8.033us         7.93%       8.033us       1.339us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         7.80%       7.904us       1.317us             6  
+                                Activity Buffer Request        46.02%       1.067ms        46.02%       1.067ms       1.067ms       1.345us         1.33%       1.345us       1.345us             1  
+                                    aten::empty_strided         1.38%      31.940us         1.38%      31.940us       5.323us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.71%     271.508us        11.71%     271.508us      45.251us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.99%      69.429us         3.79%      87.781us       3.658us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.79%      18.352us         0.79%      18.352us       0.765us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.47%     219.548us         9.47%     219.548us       4.574us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.23%       5.380us         0.23%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.707ms
-Self CUDA time total: 101.117us
+Self CPU time total: 2.319ms
+Self CUDA time total: 101.344us
 
 
 
@@ -4354,27 +4354,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     987.399us      1051.70%     987.399us     987.399us             1  
-                                            torch_eager        12.37%     335.778us        99.82%       2.710ms       2.710ms       0.000us         0.00%      95.198us      95.198us             1  
-                                              aten::mul         5.74%     155.881us         9.81%     266.305us      11.096us      48.927us        52.11%      48.927us       2.039us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.927us        52.11%      48.927us       2.039us            24  
-                                            aten::copy_         3.95%     107.229us        67.43%       1.830ms     101.693us      30.753us        32.76%      32.065us       1.781us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.881us        24.37%      22.881us       1.907us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.206us        15.13%      14.206us       1.184us            12  
-                                            aten::clone         0.99%      26.953us        64.69%       1.756ms     292.683us       0.000us         0.00%       9.184us       1.531us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.38%       7.872us       1.312us             6  
-                                              aten::add         1.25%      33.910us         2.11%      57.361us       9.560us       7.103us         7.57%       7.103us       1.184us             6  
-                                              aten::sub         1.62%      44.010us         2.55%      69.231us      11.538us       7.103us         7.57%       7.103us       1.184us             6  
-                                Activity Buffer Request        53.49%       1.452ms        53.49%       1.452ms       1.452ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.24%      33.730us         1.24%      33.730us       5.622us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.66%     207.874us         7.66%     207.874us      34.646us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.54%      68.958us         3.31%      89.820us       3.743us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.77%      20.862us         0.77%      20.862us       0.869us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.19%     222.327us         8.19%     222.327us       4.632us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.000us         0.18%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.018ms      1082.59%       1.018ms       1.018ms             1  
+                                            torch_eager        11.47%     329.955us        99.81%       2.870ms       2.870ms       0.000us         0.00%      95.358us      95.358us             1  
+                                              aten::mul         5.65%     162.614us         9.86%     283.677us      11.820us      49.056us        52.16%      49.056us       2.044us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.056us        52.16%      49.056us       2.044us            24  
+                                            aten::copy_         3.88%     111.664us        68.17%       1.960ms     108.907us      30.720us        32.66%      32.032us       1.780us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.880us        24.33%      22.880us       1.907us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.270us        15.17%      14.270us       1.189us            12  
+                                            aten::clone         1.07%      30.831us        65.73%       1.890ms     315.021us       0.000us         0.00%       9.152us       1.525us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us         8.34%       7.840us       1.307us             6  
+                                              aten::add         1.15%      33.191us         2.07%      59.441us       9.907us       7.167us         7.62%       7.167us       1.194us             6  
+                                              aten::sub         1.59%      45.863us         2.59%      74.463us      12.411us       7.103us         7.55%       7.103us       1.184us             6  
+                                Activity Buffer Request        50.07%       1.440ms        50.07%       1.440ms       1.440ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.26%      36.310us         1.26%      36.310us       6.052us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.96%     343.839us        11.96%     343.839us      57.306us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.64%      75.860us         3.31%      95.264us       3.969us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.67%      19.404us         0.67%      19.404us       0.809us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.38%     240.995us         8.38%     240.995us       5.021us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.330us         0.19%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.715ms
-Self CUDA time total: 93.886us
+Self CPU time total: 2.876ms
+Self CUDA time total: 94.046us
 
 
 
@@ -4384,27 +4384,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     939.771us       930.81%     939.771us     939.771us             1  
-                                            torch_eager        11.42%     294.218us        99.78%       2.570ms       2.570ms       0.000us         0.00%     102.276us     102.276us             1  
-                                              aten::mul         5.85%     150.653us        10.08%     259.594us      10.816us      52.609us        52.11%      52.609us       2.192us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.609us        52.11%      52.609us       2.192us            24  
-                                            aten::copy_         4.01%     103.273us        68.02%       1.752ms      97.337us      32.450us        32.14%      33.763us       1.876us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        24.40%      24.640us       2.053us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.904us        15.75%      15.904us       1.325us            12  
-                                            aten::clone         0.87%      22.360us        64.99%       1.674ms     278.983us       0.000us         0.00%       9.123us       1.520us             6  
-                                              aten::sub         1.58%      40.669us         2.53%      65.240us      10.873us       7.968us         7.89%       7.968us       1.328us             6  
-                                              aten::add         1.32%      33.930us         2.20%      56.580us       9.430us       7.936us         7.86%       7.936us       1.323us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.810us         7.74%       7.810us       1.302us             6  
-                                Activity Buffer Request        54.28%       1.398ms        54.28%       1.398ms       1.398ms       1.313us         1.30%       1.313us       1.313us             1  
-                                    aten::empty_strided         1.21%      31.291us         1.21%      31.291us       5.215us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.34%     188.943us         7.34%     188.943us      31.491us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.69%      69.330us         3.44%      88.671us       3.695us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.75%      19.341us         0.75%      19.341us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.46%     218.003us         8.46%     218.003us       4.542us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       5.651us         0.22%       5.651us       5.651us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     913.335us       900.40%     913.335us     913.335us             1  
+                                            torch_eager        10.58%     290.726us        99.81%       2.742ms       2.742ms       0.000us         0.00%     102.781us     102.781us             1  
+                                              aten::mul         5.30%     145.663us         9.31%     255.637us      10.652us      52.735us        51.99%      52.735us       2.197us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.735us        51.99%      52.735us       2.197us            24  
+                                            aten::copy_         3.74%     102.751us        70.53%       1.937ms     107.622us      32.638us        32.18%      33.982us       1.888us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.735us        24.38%      24.735us       2.061us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.064us        15.84%      16.064us       1.339us            12  
+                                            aten::clone         0.88%      24.121us        67.96%       1.867ms     311.110us       0.000us         0.00%       9.247us       1.541us             6  
+                                              aten::sub         1.29%      35.411us         2.16%      59.202us       9.867us       8.033us         7.92%       8.033us       1.339us             6  
+                                              aten::add         1.13%      30.931us         1.93%      52.952us       8.825us       8.031us         7.92%       8.031us       1.339us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us         7.79%       7.903us       1.317us             6  
+                                Activity Buffer Request        52.85%       1.452ms        52.85%       1.452ms       1.452ms       1.344us         1.32%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.21%      33.351us         1.21%      33.351us       5.559us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.71%     321.577us        11.71%     321.577us      53.596us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.55%      69.990us         3.22%      88.522us       3.688us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.67%      18.532us         0.67%      18.532us       0.772us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.90%     216.969us         7.90%     216.969us       4.520us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.091us         0.19%       5.091us       5.091us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.576ms
-Self CUDA time total: 100.963us
+Self CPU time total: 2.747ms
+Self CUDA time total: 101.437us
 
 
 
@@ -4414,27 +4414,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     987.019us       820.52%     987.019us     987.019us             1  
-                                            torch_eager        11.12%     293.915us        99.79%       2.637ms       2.637ms       0.000us         0.00%     122.116us     122.116us             1  
-                                              aten::mul         6.22%     164.251us        10.48%     276.937us      11.539us      61.922us        51.48%      61.922us       2.580us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.922us        51.48%      61.922us       2.580us            24  
-                                            aten::copy_         3.96%     104.584us        67.08%       1.772ms      98.461us      39.265us        32.64%      41.089us       2.283us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.833us        23.97%      28.833us       2.403us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.105us        15.88%      19.105us       1.592us            12  
-                                            aten::clone         0.81%      21.321us        64.15%       1.695ms     282.483us       0.000us         0.00%      12.256us       2.043us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.67%      10.432us       1.739us             6  
-                                              aten::sub         1.58%      41.691us         2.56%      67.622us      11.270us       9.569us         7.95%       9.569us       1.595us             6  
-                                              aten::add         1.31%      34.540us         2.17%      57.381us       9.563us       9.536us         7.93%       9.536us       1.589us             6  
-                                Activity Buffer Request        53.87%       1.423ms        53.87%       1.423ms       1.423ms       1.824us         1.52%       1.824us       1.824us             1  
-                                    aten::empty_strided         1.17%      30.940us         1.17%      30.940us       5.157us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.97%     184.193us         6.97%     184.193us      30.699us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.59%      94.920us         4.40%     116.150us       4.840us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.80%      21.230us         0.80%      21.230us       0.885us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.38%     221.517us         8.38%     221.517us       4.615us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.631us         0.21%       5.631us       5.631us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     929.433us       768.61%     929.433us     929.433us             1  
+                                            torch_eager        10.84%     297.701us        99.80%       2.742ms       2.742ms       0.000us         0.00%     122.716us     122.716us             1  
+                                              aten::mul         5.42%     148.850us         9.41%     258.632us      10.776us      62.014us        51.28%      62.014us       2.584us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.014us        51.28%      62.014us       2.584us            24  
+                                            aten::copy_         3.77%     103.682us        70.14%       1.927ms     107.043us      39.328us        32.52%      41.120us       2.284us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.82%      28.800us       2.400us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.582us        16.19%      19.582us       1.632us            12  
+                                            aten::clone         0.88%      24.131us        67.45%       1.853ms     308.828us       0.000us         0.00%      12.320us       2.053us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us         8.71%      10.528us       1.755us             6  
+                                              aten::sub         1.29%      35.482us         2.16%      59.433us       9.905us       9.792us         8.10%       9.792us       1.632us             6  
+                                              aten::add         1.13%      31.104us         1.94%      53.172us       8.862us       9.790us         8.10%       9.790us       1.632us             6  
+                                Activity Buffer Request        52.94%       1.454ms        52.94%       1.454ms       1.454ms       1.792us         1.48%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.18%      32.542us         1.18%      32.542us       5.424us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.19%     307.407us        11.19%     307.407us      51.235us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.56%      70.268us         3.25%      89.361us       3.723us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      19.093us         0.70%      19.093us       0.796us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.91%     217.262us         7.91%     217.262us       4.526us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.370us         0.20%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.642ms
-Self CUDA time total: 120.292us
+Self CPU time total: 2.747ms
+Self CUDA time total: 120.924us
 
 
 
@@ -4444,27 +4444,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     942.977us       547.62%     942.977us     942.977us             1  
-                                            torch_eager        11.98%     313.186us        99.77%       2.608ms       2.608ms       0.000us         0.00%     175.043us     175.043us             1  
-                                              aten::mul         5.92%     154.664us        10.07%     263.135us      10.964us      89.731us        52.11%      89.731us       3.739us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.731us        52.11%      89.731us       3.739us            24  
-                                            aten::copy_         4.21%     110.022us        67.75%       1.771ms      98.397us      57.632us        33.47%      60.480us       3.360us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.640us        23.60%      40.640us       3.387us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.832us        14.42%      24.832us       2.069us            12  
-                                            aten::clone         1.00%      26.050us        64.65%       1.690ms     281.685us       0.000us         0.00%      19.840us       3.307us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.87%      16.992us       2.832us             6  
-                                              aten::add         1.22%      32.012us         2.08%      54.302us       9.050us      12.416us         7.21%      12.416us       2.069us             6  
-                                              aten::sub         1.48%      38.721us         2.41%      62.881us      10.480us      12.416us         7.21%      12.416us       2.069us             6  
-                                Activity Buffer Request        54.20%       1.417ms        54.20%       1.417ms       1.417ms       2.848us         1.65%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.15%      30.180us         1.15%      30.180us       5.030us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.98%     182.574us         6.98%     182.574us      30.429us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.56%      66.979us         3.34%      87.351us       3.640us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      20.372us         0.78%      20.372us       0.849us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.28%     216.491us         8.28%     216.491us       4.510us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.23%       5.900us         0.23%       5.900us       5.900us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     942.082us       549.37%     942.082us     942.082us             1  
+                                            torch_eager        20.10%     308.752us        99.67%       1.531ms       1.531ms       0.000us         0.00%     174.365us     174.365us             1  
+                                              aten::mul         9.79%     150.414us        16.96%     260.516us      10.855us      89.056us        51.93%      89.056us       3.711us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.056us        51.93%      89.056us       3.711us            24  
+                                            aten::copy_         6.91%     106.224us        46.22%     710.060us      39.448us      57.503us        33.53%      60.383us       3.355us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.511us        23.62%      40.511us       3.376us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.926us        14.54%      24.926us       2.077us            12  
+                                            aten::clone         1.37%      21.029us        40.87%     627.796us     104.633us       0.000us         0.00%      19.872us       3.312us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.91%      16.992us       2.832us             6  
+                                              aten::sub         2.26%      34.730us         3.83%      58.781us       9.797us      12.479us         7.28%      12.479us       2.080us             6  
+                                              aten::add         2.00%      30.683us         3.45%      52.973us       8.829us      12.447us         7.26%      12.447us       2.075us             6  
+                                Activity Buffer Request        16.15%     248.056us        16.15%     248.056us     248.056us       2.880us         1.68%       2.880us       2.880us             1  
+                                    aten::empty_strided         2.04%      31.392us         2.04%      31.392us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.97%     291.479us        18.97%     291.479us      48.580us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.49%      68.986us         5.70%      87.586us       3.649us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.21%      18.600us         1.21%      18.600us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.37%     220.744us        14.37%     220.744us       4.599us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.33%       5.080us         0.33%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.614ms
-Self CUDA time total: 172.195us
+Self CPU time total: 1.536ms
+Self CUDA time total: 171.485us
 
 
 
@@ -4474,27 +4474,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     954.334us       791.88%     954.334us     954.334us             1  
-                                            torch_eager        21.12%     286.823us        99.60%       1.352ms       1.352ms       0.000us         0.00%     122.339us     122.339us             1  
-                                              aten::mul        11.39%     154.733us        19.43%     263.854us      10.994us      61.889us        51.35%      61.889us       2.579us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.889us        51.35%      61.889us       2.579us            24  
-                                            aten::copy_         8.06%     109.392us        38.94%     528.759us      29.376us      39.393us        32.69%      41.217us       2.290us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.864us        23.95%      28.864us       2.405us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.233us        15.96%      19.233us       1.603us            12  
-                                            aten::clone         1.54%      20.901us        32.67%     443.638us      73.940us       0.000us         0.00%      12.353us       2.059us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.529us         8.74%      10.529us       1.755us             6  
-                                              aten::sub         2.93%      39.731us         4.81%      65.293us      10.882us       9.633us         7.99%       9.633us       1.606us             6  
-                                              aten::add         2.54%      34.552us         4.77%      64.792us      10.799us       9.600us         7.97%       9.600us       1.600us             6  
-                                Activity Buffer Request        12.72%     172.763us        12.72%     172.763us     172.763us       1.824us         1.51%       1.824us       1.824us             1  
-                                    aten::empty_strided         2.32%      31.561us         2.32%      31.561us       5.260us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.45%     182.623us        13.45%     182.623us      30.437us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.17%      70.140us         6.66%      90.481us       3.770us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.50%      20.341us         1.50%      20.341us       0.848us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.86%     228.904us        16.86%     228.904us       4.769us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.490us         0.40%       5.490us       5.490us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     906.096us       748.31%     906.096us     906.096us             1  
+                                            torch_eager        18.91%     280.775us        99.66%       1.480ms       1.480ms       0.000us         0.00%     122.910us     122.910us             1  
+                                              aten::mul        10.01%     148.664us        17.45%     259.167us      10.799us      62.174us        51.35%      62.174us       2.591us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.174us        51.35%      62.174us       2.591us            24  
+                                            aten::copy_         6.88%     102.100us        46.50%     690.526us      38.363us      39.392us        32.53%      41.216us       2.290us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.78%      28.800us       2.400us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.520us        16.12%      19.520us       1.627us            12  
+                                            aten::clone         1.45%      21.579us        41.36%     614.176us     102.363us       0.000us         0.00%      12.416us       2.069us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us         8.75%      10.592us       1.765us             6  
+                                              aten::sub         2.32%      34.432us         3.90%      57.973us       9.662us       9.760us         8.06%       9.760us       1.627us             6  
+                                              aten::add         2.12%      31.432us         3.61%      53.552us       8.925us       9.760us         8.06%       9.760us       1.627us             6  
+                                Activity Buffer Request        17.05%     253.136us        17.05%     253.136us     253.136us       1.824us         1.51%       1.824us       1.824us             1  
+                                    aten::empty_strided         2.06%      30.533us         2.06%      30.533us       5.089us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.50%     274.717us        18.50%     274.717us      45.786us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.53%      67.311us         5.78%      85.812us       3.575us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.25%      18.501us         1.25%      18.501us       0.771us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.60%     216.737us        14.60%     216.737us       4.515us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       4.981us         0.34%       4.981us       4.981us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.358ms
-Self CUDA time total: 120.515us
+Self CPU time total: 1.485ms
+Self CUDA time total: 121.086us
 
 
 
@@ -4504,27 +4504,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     961.439us       559.06%     961.439us     961.439us             1  
-                                            torch_eager        21.39%     301.083us        99.65%       1.403ms       1.403ms       0.000us         0.00%     174.821us     174.821us             1  
-                                              aten::mul        10.92%     153.723us        18.79%     264.437us      11.018us      89.541us        52.07%      89.541us       3.731us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.541us        52.07%      89.541us       3.731us            24  
-                                            aten::copy_         8.57%     120.662us        41.11%     578.630us      32.146us      57.631us        33.51%      60.479us       3.360us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.639us        23.63%      40.639us       3.387us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.801us        14.42%      24.801us       2.067us            12  
-                                            aten::clone         1.49%      21.022us        33.99%     478.490us      79.748us       0.000us         0.00%      19.840us       3.307us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.88%      16.992us       2.832us             6  
-                                              aten::add         2.26%      31.841us         3.85%      54.131us       9.022us      12.481us         7.26%      12.481us       2.080us             6  
-                                              aten::sub         2.79%      39.260us         4.52%      63.691us      10.615us      12.320us         7.16%      12.320us       2.053us             6  
-                                Activity Buffer Request        15.02%     211.404us        15.02%     211.404us     211.404us       2.848us         1.66%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.10%      29.500us         2.10%      29.500us       4.917us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.01%     183.184us        13.01%     183.184us      30.531us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.96%      69.812us         6.41%      90.211us       3.759us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.45%      20.399us         1.45%      20.399us       0.850us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.69%     220.815us        15.69%     220.815us       4.600us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       4.890us         0.35%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     954.294us       555.32%     954.294us     954.294us             1  
+                                            torch_eager        11.21%     307.269us        99.82%       2.735ms       2.735ms       0.000us         0.00%     174.694us     174.694us             1  
+                                              aten::mul         5.59%     153.258us         9.69%     265.580us      11.066us      89.476us        52.07%      89.476us       3.728us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.476us        52.07%      89.476us       3.728us            24  
+                                            aten::copy_         3.78%     103.631us        69.46%       1.903ms     105.735us      57.505us        33.46%      60.353us       3.353us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.545us        23.59%      40.545us       3.379us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.865us        14.47%      24.865us       2.072us            12  
+                                            aten::clone         0.89%      24.491us        66.72%       1.828ms     304.733us       0.000us         0.00%      19.808us       3.301us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us         9.87%      16.960us       2.827us             6  
+                                              aten::add         1.15%      31.480us         1.96%      53.761us       8.960us      12.448us         7.24%      12.448us       2.075us             6  
+                                              aten::sub         1.31%      35.801us         2.17%      59.462us       9.910us      12.417us         7.23%      12.417us       2.070us             6  
+                                Activity Buffer Request        53.91%       1.477ms        53.91%       1.477ms       1.477ms       2.848us         1.66%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.13%      30.930us         1.13%      30.930us       5.155us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.51%     260.666us         9.51%     260.666us      43.444us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.58%      70.761us         3.30%      90.449us       3.769us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.72%      19.688us         0.72%      19.688us       0.820us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.03%     220.086us         8.03%     220.086us       4.585us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.030us         0.18%       5.030us       5.030us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.408ms
-Self CUDA time total: 171.973us
+Self CPU time total: 2.740ms
+Self CUDA time total: 171.846us
 
 
 
@@ -4534,27 +4534,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     959.740us       338.81%     959.740us     959.740us             1  
-                                            torch_eager        11.78%     309.495us        99.81%       2.622ms       2.622ms       0.000us         0.00%     301.248us     301.248us             1  
-                                              aten::mul         5.80%     152.430us         9.98%     262.294us      10.929us     133.378us        47.09%     133.378us       5.557us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     133.378us        47.09%     133.378us       5.557us            24  
-                                            aten::copy_         4.09%     107.511us        67.37%       1.770ms      98.338us     108.832us        38.42%     126.816us       7.045us            18  
-                                            aten::clone         1.07%      28.041us        64.54%       1.696ms     282.603us       0.000us         0.00%      69.600us      11.600us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.216us        20.20%      57.216us       4.768us            12  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.616us        18.22%      51.616us       8.603us             6  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.054us        14.49%      41.054us       3.421us            12  
-                                              aten::sub         1.57%      41.190us         2.52%      66.080us      11.013us      20.607us         7.27%      20.607us       3.434us             6  
-                                              aten::add         1.56%      40.972us         2.46%      64.512us      10.752us      20.447us         7.22%      20.447us       3.408us             6  
-                                Activity Buffer Request        53.79%       1.413ms        53.79%       1.413ms       1.413ms      17.984us         6.35%      17.984us      17.984us             1  
-                                    aten::empty_strided         1.19%      31.311us         1.19%      31.311us       5.218us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.14%     187.713us         7.14%     187.713us      31.285us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.66%      69.760us         3.44%      90.282us       3.762us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      20.522us         0.78%      20.522us       0.855us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.37%     219.936us         8.37%     219.936us       4.582us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.111us         0.19%       5.111us       5.111us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     917.943us       324.46%     917.943us     917.943us             1  
+                                            torch_eager        18.90%     277.703us        99.65%       1.464ms       1.464ms       0.000us         0.00%     301.376us     301.376us             1  
+                                              aten::mul         9.84%     144.586us        17.44%     256.139us      10.672us     132.736us        46.92%     132.736us       5.531us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.736us        46.92%     132.736us       5.531us            24  
+                                            aten::copy_         7.06%     103.765us        45.63%     670.307us      37.239us     109.119us        38.57%     127.583us       7.088us            18  
+                                            aten::clone         1.58%      23.262us        40.78%     599.096us      99.849us       0.000us         0.00%      70.336us      11.723us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.247us        20.23%      57.247us       4.771us            12  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.872us        18.34%      51.872us       8.645us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.057us        14.51%      41.057us       3.421us            12  
+                                              aten::add         2.13%      31.271us         3.65%      53.632us       8.939us      20.545us         7.26%      20.545us       3.424us             6  
+                                              aten::sub         2.39%      35.109us         4.06%      59.711us       9.952us      20.512us         7.25%      20.512us       3.419us             6  
+                                Activity Buffer Request        16.07%     236.106us        16.07%     236.106us     236.106us      18.464us         6.53%      18.464us      18.464us             1  
+                                    aten::empty_strided         2.35%      34.500us         2.35%      34.500us       5.750us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.36%     269.767us        18.36%     269.767us      44.961us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.78%      70.183us         6.04%      88.753us       3.698us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      18.570us         1.26%      18.570us       0.774us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.92%     219.185us        14.92%     219.185us       4.566us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.090us         0.35%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.627ms
-Self CUDA time total: 283.264us
+Self CPU time total: 1.469ms
+Self CUDA time total: 282.912us
 
 
 
@@ -4564,27 +4564,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     964.299us       170.17%     964.299us     964.299us             1  
-                                            torch_eager        21.37%     289.253us        99.58%       1.348ms       1.348ms       0.000us         0.00%     590.419us     590.419us             1  
-                                            aten::copy_         7.69%     104.123us        37.93%     513.450us      28.525us     274.106us        48.37%     297.849us      16.547us            18  
-                                              aten::mul        11.75%     159.118us        20.07%     271.705us      11.321us     226.427us        39.96%     226.427us       9.434us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     226.427us        39.96%     226.427us       9.434us            24  
-                                            aten::clone         1.55%      21.020us        32.53%     440.358us      73.393us       0.000us         0.00%     206.843us      34.474us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.100us        32.31%     183.100us      30.517us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.006us        16.06%      91.006us       7.584us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.143us        11.67%      66.143us       5.512us            12  
-                                              aten::sub         3.06%      41.432us         4.99%      67.562us      11.260us      33.664us         5.94%      33.664us       5.611us             6  
-                                              aten::add         2.43%      32.930us         4.17%      56.451us       9.408us      32.479us         5.73%      32.479us       5.413us             6  
-                                Activity Buffer Request        11.95%     161.793us        11.95%     161.793us     161.793us      23.743us         4.19%      23.743us      23.743us             1  
-                                    aten::empty_strided         2.85%      38.611us         2.85%      38.611us       6.435us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.59%     183.934us        13.59%     183.934us      30.656us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.13%      69.460us         6.64%      89.941us       3.748us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.51%      20.481us         1.51%      20.481us       0.853us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.68%     225.838us        16.68%     225.838us       4.705us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       5.710us         0.42%       5.710us       5.710us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     931.832us       165.35%     931.832us     931.832us             1  
+                                            torch_eager        19.27%     283.137us        99.64%       1.464ms       1.464ms       0.000us         0.00%     587.261us     587.261us             1  
+                                            aten::copy_         7.04%     103.435us        44.90%     659.587us      36.644us     272.511us        48.36%     296.223us      16.457us            18  
+                                              aten::mul        10.36%     152.225us        18.18%     267.110us      11.130us     224.829us        39.90%     224.829us       9.368us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     224.829us        39.90%     224.829us       9.368us            24  
+                                            aten::clone         1.47%      21.550us        39.53%     580.673us      96.779us       0.000us         0.00%     205.855us      34.309us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.143us        32.32%     182.143us      30.357us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.368us        16.04%      90.368us       7.531us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.209us        11.75%      66.209us       5.517us            12  
+                                              aten::sub         2.39%      35.041us         4.07%      59.831us       9.972us      33.760us         5.99%      33.760us       5.627us             6  
+                                              aten::add         2.15%      31.591us         3.70%      54.401us       9.067us      32.449us         5.76%      32.449us       5.408us             6  
+                                Activity Buffer Request        16.23%     238.406us        16.23%     238.406us     238.406us      23.712us         4.21%      23.712us      23.712us             1  
+                                    aten::empty_strided         2.04%      29.960us         2.04%      29.960us       4.993us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.39%     255.475us        17.39%     255.475us      42.579us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.73%      69.441us         6.00%      88.092us       3.670us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.27%      18.651us         1.27%      18.651us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.30%     224.756us        15.30%     224.756us       4.682us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.280us         0.36%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.354ms
-Self CUDA time total: 566.676us
+Self CPU time total: 1.469ms
+Self CUDA time total: 563.549us
 
 
 
@@ -4594,27 +4594,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     940.757us      1018.68%     940.757us     940.757us             1  
-                                            torch_eager        20.92%     284.932us        99.61%       1.357ms       1.357ms       0.000us         0.00%      93.503us      93.503us             1  
-                                              aten::mul        11.51%     156.743us        19.57%     266.566us      11.107us      49.664us        53.78%      49.664us       2.069us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.664us        53.78%      49.664us       2.069us            24  
-                                            aten::copy_         7.76%     105.742us        39.84%     542.619us      30.146us      29.343us        31.77%      30.495us       1.694us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.528us        24.39%      22.528us       1.877us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.344us        14.45%      13.344us       1.112us            12  
-                                            aten::clone         1.52%      20.734us        33.85%     461.099us      76.850us       0.000us         0.00%       7.967us       1.328us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.815us         7.38%       6.815us       1.136us             6  
-                                              aten::sub         2.96%      40.252us         4.79%      65.263us      10.877us       6.688us         7.24%       6.688us       1.115us             6  
-                                              aten::add         2.34%      31.811us         3.99%      54.311us       9.052us       6.656us         7.21%       6.656us       1.109us             6  
-                                Activity Buffer Request        14.09%     191.853us        14.09%     191.853us     191.853us       1.152us         1.25%       1.152us       1.152us             1  
-                                    aten::empty_strided         2.30%      31.379us         2.30%      31.379us       5.230us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.46%     183.403us        13.46%     183.403us      30.567us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.20%      70.859us         6.67%      90.910us       3.788us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.47%      20.051us         1.47%      20.051us       0.835us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.08%     218.955us        16.08%     218.955us       4.562us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.39%       5.360us         0.39%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     948.157us      1025.28%     948.157us     948.157us             1  
+                                            torch_eager        11.31%     303.890us        99.80%       2.681ms       2.681ms       0.000us         0.00%      93.597us      93.597us             1  
+                                              aten::mul         5.70%     153.152us         9.94%     267.009us      11.125us      49.696us        53.74%      49.696us       2.071us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.696us        53.74%      49.696us       2.071us            24  
+                                            aten::copy_         3.75%     100.883us        69.10%       1.857ms     103.143us      29.375us        31.76%      30.494us       1.694us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        24.43%      22.592us       1.883us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.407us        14.50%      13.407us       1.117us            12  
+                                            aten::clone         0.85%      22.792us        66.32%       1.782ms     296.986us       0.000us         0.00%       7.902us       1.317us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.783us         7.33%       6.783us       1.131us             6  
+                                              aten::sub         1.31%      35.191us         2.17%      58.341us       9.724us       6.720us         7.27%       6.720us       1.120us             6  
+                                              aten::add         1.15%      30.820us         1.98%      53.181us       8.863us       6.687us         7.23%       6.687us       1.114us             6  
+                                Activity Buffer Request        53.95%       1.449ms        53.95%       1.449ms       1.449ms       1.119us         1.21%       1.119us       1.119us             1  
+                                    aten::empty_strided         1.15%      30.830us         1.15%      30.830us       5.138us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.13%     245.326us         9.13%     245.326us      40.888us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.61%      70.171us         3.31%      88.830us       3.701us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.69%      18.659us         0.69%      18.659us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.20%     220.298us         8.20%     220.298us       4.590us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.250us         0.20%       5.250us       5.250us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.362ms
-Self CUDA time total: 92.351us
+Self CPU time total: 2.687ms
+Self CUDA time total: 92.478us
 
 
 
@@ -4624,27 +4624,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.706us       986.10%     945.706us     945.706us             1  
-                                            torch_eager        12.18%     322.968us        99.79%       2.647ms       2.647ms       0.000us         0.00%      97.216us      97.216us             1  
-                                              aten::mul         5.85%     155.091us         9.99%     264.924us      11.039us      50.947us        53.12%      50.947us       2.123us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      50.947us        53.12%      50.947us       2.123us            24  
-                                            aten::copy_         3.92%     103.931us        67.30%       1.785ms      99.174us      30.783us        32.10%      32.095us       1.783us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.976us        23.96%      22.976us       1.915us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.174us        14.78%      14.174us       1.181us            12  
-                                            aten::clone         1.18%      31.280us        64.70%       1.716ms     286.035us       0.000us         0.00%       9.119us       1.520us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         8.14%       7.807us       1.301us             6  
-                                              aten::add         1.22%      32.380us         2.09%      55.311us       9.219us       7.102us         7.41%       7.102us       1.184us             6  
-                                              aten::sub         1.50%      39.882us         2.41%      63.892us      10.649us       7.072us         7.37%       7.072us       1.179us             6  
-                                Activity Buffer Request        53.95%       1.431ms        53.95%       1.431ms       1.431ms       1.312us         1.37%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.23%      32.600us         1.23%      32.600us       5.433us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.05%     187.002us         7.05%     187.002us      31.167us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.63%      69.642us         3.43%      90.901us       3.788us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.80%      21.259us         0.80%      21.259us       0.886us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.29%     220.006us         8.29%     220.006us       4.583us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.569us         0.21%       5.569us       5.569us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     924.823us       959.84%     924.823us     924.823us             1  
+                                            torch_eager        19.47%     279.525us        99.65%       1.430ms       1.430ms       0.000us         0.00%      97.664us      97.664us             1  
+                                              aten::mul        10.27%     147.364us        19.04%     273.370us      11.390us      51.165us        53.10%      51.165us       2.132us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.165us        53.10%      51.165us       2.132us            24  
+                                            aten::copy_         7.14%     102.519us        43.74%     627.869us      34.882us      30.913us        32.08%      32.225us       1.790us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        23.91%      23.040us       1.920us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.274us        14.81%      14.274us       1.189us            12  
+                                            aten::clone         1.45%      20.838us        38.33%     550.144us      91.691us       0.000us         0.00%       9.185us       1.531us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.873us         8.17%       7.873us       1.312us             6  
+                                              aten::add         2.18%      31.279us         3.75%      53.900us       8.983us       7.137us         7.41%       7.137us       1.189us             6  
+                                              aten::sub         2.45%      35.101us         4.11%      58.931us       9.822us       7.137us         7.41%       7.137us       1.189us             6  
+                                Activity Buffer Request        15.34%     220.215us        15.34%     220.215us     220.215us       1.312us         1.36%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.15%      30.891us         2.15%      30.891us       5.148us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.11%     245.545us        17.11%     245.545us      40.924us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.62%      66.322us         5.93%      85.082us       3.545us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.31%      18.760us         1.31%      18.760us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.17%     232.047us        16.17%     232.047us       4.834us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.041us         0.35%       5.041us       5.041us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.653ms
-Self CUDA time total: 95.904us
+Self CPU time total: 1.435ms
+Self CUDA time total: 96.352us
 
 
 
@@ -4654,27 +4654,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     963.956us       929.78%     963.956us     963.956us             1  
-                                            torch_eager        11.95%     315.942us        99.78%       2.637ms       2.637ms       0.000us         0.00%     104.988us     104.988us             1  
-                                              aten::mul         6.01%     158.721us        10.21%     269.951us      11.248us      55.295us        53.33%      55.295us       2.304us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.295us        53.33%      55.295us       2.304us            24  
-                                            aten::copy_         4.03%     106.403us        67.45%       1.783ms      99.031us      32.417us        31.27%      33.729us       1.874us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.607us        23.73%      24.607us       2.051us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.964us        15.40%      15.964us       1.330us            12  
-                                            aten::clone         1.02%      26.870us        64.62%       1.708ms     284.615us       0.000us         0.00%       9.122us       1.520us             6  
-                                              aten::add         1.23%      32.629us         2.10%      55.390us       9.232us       7.997us         7.71%       7.997us       1.333us             6  
-                                              aten::sub         1.44%      38.041us         2.36%      62.260us      10.377us       7.967us         7.68%       7.967us       1.328us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.810us         7.53%       7.810us       1.302us             6  
-                                Activity Buffer Request        54.08%       1.429ms        54.08%       1.429ms       1.429ms       1.312us         1.27%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.27%      33.640us         1.27%      33.640us       5.607us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.95%     183.544us         6.95%     183.544us      30.591us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.64%      69.789us         3.42%      90.471us       3.770us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      20.682us         0.78%      20.682us       0.862us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.39%     221.610us         8.39%     221.610us       4.617us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       5.700us         0.22%       5.700us       5.700us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     915.886us       880.13%     915.886us     915.886us             1  
+                                            torch_eager        19.45%     278.057us        99.65%       1.425ms       1.425ms       0.000us         0.00%     105.374us     105.374us             1  
+                                              aten::mul        10.44%     149.250us        18.09%     258.645us      10.777us      55.325us        53.17%      55.325us       2.305us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.325us        53.17%      55.325us       2.305us            24  
+                                            aten::copy_         7.22%     103.283us        44.53%     636.707us      35.373us      32.575us        31.30%      33.887us       1.883us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        23.74%      24.703us       2.059us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.162us        15.53%      16.162us       1.347us            12  
+                                            aten::clone         1.49%      21.291us        38.97%     557.204us      92.867us       0.000us         0.00%       9.184us       1.531us             6  
+                                              aten::sub         2.42%      34.610us         4.09%      58.491us       9.749us       8.096us         7.78%       8.096us       1.349us             6  
+                                              aten::add         2.18%      31.210us         3.76%      53.710us       8.952us       8.066us         7.75%       8.066us       1.344us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         7.56%       7.872us       1.312us             6  
+                                Activity Buffer Request        15.88%     227.005us        15.88%     227.005us     227.005us       1.312us         1.26%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.12%      30.341us         2.12%      30.341us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.11%     244.667us        17.11%     244.667us      40.778us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.81%      68.755us         6.12%      87.484us       3.645us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.31%      18.729us         1.31%      18.729us       0.780us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.21%     217.528us        15.21%     217.528us       4.532us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.011us         0.35%       5.011us       5.011us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.643ms
-Self CUDA time total: 103.676us
+Self CPU time total: 1.430ms
+Self CUDA time total: 104.062us
 
 
 
@@ -4684,27 +4684,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     933.942us       757.68%     933.942us     933.942us             1  
-                                            torch_eager        21.17%     287.829us        99.59%       1.354ms       1.354ms       0.000us         0.00%     125.024us     125.024us             1  
-                                              aten::mul        11.38%     154.770us        19.33%     262.774us      10.949us      64.862us        52.62%      64.862us       2.703us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      64.862us        52.62%      64.862us       2.703us            24  
-                                            aten::copy_         7.76%     105.560us        40.17%     546.058us      30.337us      39.265us        31.85%      41.025us       2.279us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.865us        23.42%      28.865us       2.405us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.137us        15.53%      19.137us       1.595us            12  
-                                            aten::clone         1.51%      20.520us        34.08%     463.317us      77.220us       0.000us         0.00%      12.160us       2.027us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.400us         8.44%      10.400us       1.733us             6  
-                                              aten::sub         2.90%      39.471us         4.67%      63.511us      10.585us       9.569us         7.76%       9.569us       1.595us             6  
-                                              aten::add         2.50%      34.030us         4.22%      57.431us       9.572us       9.568us         7.76%       9.568us       1.595us             6  
-                                Activity Buffer Request        14.30%     194.363us        14.30%     194.363us     194.363us       1.760us         1.43%       1.760us       1.760us             1  
-                                    aten::empty_strided         2.23%      30.321us         2.23%      30.321us       5.053us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.45%     182.914us        13.45%     182.914us      30.486us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.84%      65.748us         6.29%      85.480us       3.562us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.45%      19.732us         1.45%      19.732us       0.822us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.08%     218.666us        16.08%     218.666us       4.556us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.41%       5.560us         0.41%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     926.227us       747.17%     926.227us     926.227us             1  
+                                            torch_eager        10.87%     288.725us        99.79%       2.651ms       2.651ms       0.000us         0.00%     125.755us     125.755us             1  
+                                              aten::mul         5.66%     150.315us         9.84%     261.507us      10.896us      65.119us        52.53%      65.119us       2.713us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.119us        52.53%      65.119us       2.713us            24  
+                                            aten::copy_         3.77%     100.152us        69.45%       1.845ms     102.495us      39.455us        31.83%      41.246us       2.291us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.928us        23.34%      28.928us       2.411us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.390us        15.64%      19.390us       1.616us            12  
+                                            aten::clone         0.89%      23.522us        66.73%       1.773ms     295.426us       0.000us         0.00%      12.318us       2.053us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.527us         8.49%      10.527us       1.755us             6  
+                                              aten::add         1.16%      30.840us         2.00%      53.221us       8.870us       9.759us         7.87%       9.759us       1.626us             6  
+                                              aten::sub         1.31%      34.853us         2.22%      58.863us       9.811us       9.631us         7.77%       9.631us       1.605us             6  
+                                Activity Buffer Request        54.50%       1.448ms        54.50%       1.448ms       1.448ms       1.791us         1.44%       1.791us       1.791us             1  
+                                    aten::empty_strided         1.16%      30.740us         1.16%      30.740us       5.123us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.93%     237.245us         8.93%     237.245us      39.541us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.65%      70.502us         3.36%      89.223us       3.718us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.70%      18.721us         0.70%      18.721us       0.780us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.19%     217.516us         8.19%     217.516us       4.532us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.590us         0.21%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.359ms
-Self CUDA time total: 123.264us
+Self CPU time total: 2.656ms
+Self CUDA time total: 123.964us
 
 
 
@@ -4714,27 +4714,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     934.359us       900.66%     934.359us     934.359us             1  
-                                            torch_eager        21.17%     286.322us        99.59%       1.347ms       1.347ms       0.000us         0.00%     105.086us     105.086us             1  
-                                              aten::mul        11.62%     157.214us        19.66%     265.945us      11.081us      55.327us        53.33%      55.327us       2.305us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.327us        53.33%      55.327us       2.305us            24  
-                                            aten::copy_         7.65%     103.495us        39.66%     536.482us      29.805us      32.511us        31.34%      33.855us       1.881us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.704us        23.81%      24.704us       2.059us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.904us        15.33%      15.904us       1.325us            12  
-                                            aten::clone         1.57%      21.280us        33.91%     458.650us      76.442us       0.000us         0.00%       9.151us       1.525us             6  
-                                              aten::add         2.43%      32.883us         4.09%      55.372us       9.229us       8.001us         7.71%       8.001us       1.333us             6  
-                                              aten::sub         2.87%      38.810us         4.64%      62.781us      10.463us       7.903us         7.62%       7.903us       1.317us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         7.53%       7.807us       1.301us             6  
-                                Activity Buffer Request        14.06%     190.184us        14.06%     190.184us     190.184us       1.344us         1.30%       1.344us       1.344us             1  
-                                    aten::empty_strided         2.22%      30.070us         2.22%      30.070us       5.012us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.39%     181.103us        13.39%     181.103us      30.184us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.05%      68.302us         6.56%      88.771us       3.699us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.51%      20.469us         1.51%      20.469us       0.853us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.03%     216.891us        16.03%     216.891us       4.519us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.41%       5.591us         0.41%       5.591us       5.591us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     925.174us       889.06%     925.174us     925.174us             1  
+                                            torch_eager        20.56%     293.458us        99.64%       1.423ms       1.423ms       0.000us         0.00%     105.438us     105.438us             1  
+                                              aten::mul        10.42%     148.708us        18.32%     261.500us      10.896us      55.264us        53.11%      55.264us       2.303us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.264us        53.11%      55.264us       2.303us            24  
+                                            aten::copy_         7.08%     101.081us        43.33%     618.656us      34.370us      32.670us        31.39%      34.046us       1.891us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        23.74%      24.703us       2.059us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.128us        15.50%      16.128us       1.344us            12  
+                                            aten::clone         1.49%      21.220us        38.03%     542.913us      90.485us       0.000us         0.00%       9.343us       1.557us             6  
+                                              aten::sub         2.38%      33.992us         4.03%      57.481us       9.580us       8.064us         7.75%       8.064us       1.344us             6  
+                                              aten::add         2.21%      31.510us         3.80%      54.250us       9.042us       8.064us         7.75%       8.064us       1.344us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.967us         7.66%       7.967us       1.328us             6  
+                                Activity Buffer Request        14.99%     214.036us        14.99%     214.036us     214.036us       1.376us         1.32%       1.376us       1.376us             1  
+                                    aten::empty_strided         2.13%      30.461us         2.13%      30.461us       5.077us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.05%     243.458us        17.05%     243.458us      40.576us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.68%      66.831us         5.99%      85.500us       3.562us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.31%      18.669us         1.31%      18.669us       0.778us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.35%     219.102us        15.35%     219.102us       4.565us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.101us         0.36%       5.101us       5.101us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.353ms
-Self CUDA time total: 103.742us
+Self CPU time total: 1.428ms
+Self CUDA time total: 104.062us
 
 
 
@@ -4744,27 +4744,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.694us       764.03%     944.694us     944.694us             1  
-                                            torch_eager        20.48%     287.824us        99.60%       1.400ms       1.400ms       0.000us         0.00%     125.438us     125.438us             1  
-                                              aten::mul        10.91%     153.363us        18.83%     264.625us      11.026us      65.151us        52.69%      65.151us       2.715us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.151us        52.69%      65.151us       2.715us            24  
-                                            aten::copy_         7.88%     110.793us        41.73%     586.532us      32.585us      39.328us        31.81%      41.120us       2.284us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.895us        23.37%      28.895us       2.408us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.167us        15.50%      19.167us       1.597us            12  
-                                            aten::clone         1.52%      21.310us        35.87%     504.089us      84.015us       0.000us         0.00%      12.225us       2.038us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.433us         8.44%      10.433us       1.739us             6  
-                                              aten::sub         2.80%      39.332us         4.57%      64.213us      10.702us       9.632us         7.79%       9.632us       1.605us             6  
-                                              aten::add         2.33%      32.799us         3.97%      55.790us       9.298us       9.535us         7.71%       9.535us       1.589us             6  
-                                Activity Buffer Request        15.08%     211.984us        15.08%     211.984us     211.984us       1.792us         1.45%       1.792us       1.792us             1  
-                                    aten::empty_strided         2.18%      30.690us         2.18%      30.690us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.35%     201.734us        14.35%     201.734us      33.622us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.89%      68.724us         6.32%      88.851us       3.702us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.43%      20.127us         1.43%      20.127us       0.839us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.74%     221.155us        15.74%     221.155us       4.607us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.570us         0.40%       5.570us       5.570us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     901.909us       727.35%     901.909us     901.909us             1  
+                                            torch_eager        19.87%     274.810us        99.60%       1.377ms       1.377ms       0.000us         0.00%     125.791us     125.791us             1  
+                                              aten::mul        10.85%     149.967us        18.79%     259.807us      10.825us      65.086us        52.49%      65.086us       2.712us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.086us        52.49%      65.086us       2.712us            24  
+                                            aten::copy_         7.46%     103.216us        42.83%     592.168us      32.898us      39.518us        31.87%      41.310us       2.295us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.862us        23.28%      28.862us       2.405us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.395us        15.64%      19.395us       1.616us            12  
+                                            aten::clone         1.61%      22.200us        37.56%     519.385us      86.564us       0.000us         0.00%      12.448us       2.075us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us         8.59%      10.656us       1.776us             6  
+                                              aten::add         2.23%      30.899us         3.81%      52.660us       8.777us       9.730us         7.85%       9.730us       1.622us             6  
+                                              aten::sub         2.44%      33.801us         4.13%      57.151us       9.525us       9.665us         7.79%       9.665us       1.611us             6  
+                                Activity Buffer Request        13.62%     188.345us        13.62%     188.345us     188.345us       1.792us         1.45%       1.792us       1.792us             1  
+                                    aten::empty_strided         2.34%      32.371us         2.34%      32.371us       5.395us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.39%     240.467us        17.39%     240.467us      40.078us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.87%      67.397us         6.22%      86.038us       3.585us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.35%      18.641us         1.35%      18.641us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.56%     215.091us        15.56%     215.091us       4.481us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.540us         0.40%       5.540us       5.540us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.405ms
-Self CUDA time total: 123.646us
+Self CPU time total: 1.383ms
+Self CUDA time total: 123.999us
 
 
 
@@ -4774,27 +4774,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.077us       529.63%     938.077us     938.077us             1  
-                                            torch_eager        22.00%     288.844us        99.57%       1.307ms       1.307ms       0.000us         0.00%     179.967us     179.967us             1  
-                                              aten::mul        11.92%     156.562us        20.13%     264.245us      11.010us      94.881us        53.57%      94.881us       3.953us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.881us        53.57%      94.881us       3.953us            24  
-                                            aten::copy_         8.04%     105.524us        37.72%     495.290us      27.516us      57.663us        32.56%      60.511us       3.362us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.672us        22.96%      40.672us       3.389us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.575us        13.87%      24.575us       2.048us            12  
-                                            aten::clone         1.60%      21.071us        31.51%     413.758us      68.960us       0.000us         0.00%      19.839us       3.306us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.991us         9.59%      16.991us       2.832us             6  
-                                              aten::add         2.42%      31.800us         4.16%      54.561us       9.093us      12.288us         6.94%      12.288us       2.048us             6  
-                                              aten::sub         3.05%      40.090us         5.01%      65.752us      10.959us      12.287us         6.94%      12.287us       2.048us             6  
-                                Activity Buffer Request        10.75%     141.113us        10.75%     141.113us     141.113us       2.848us         1.61%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.28%      29.940us         2.28%      29.940us       4.990us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        14.21%     186.543us        14.21%     186.543us      31.091us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.18%      67.990us         6.68%      87.660us       3.652us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.50%      19.670us         1.50%      19.670us       0.820us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.62%     218.216us        16.62%     218.216us       4.546us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.43%       5.650us         0.43%       5.650us       5.650us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.661us       533.26%     944.661us     944.661us             1  
+                                            torch_eager        10.70%     284.298us        99.79%       2.652ms       2.652ms       0.000us         0.00%     180.029us     180.029us             1  
+                                              aten::mul         6.06%     161.074us        10.27%     272.980us      11.374us      94.781us        53.50%      94.781us       3.949us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.781us        53.50%      94.781us       3.949us            24  
+                                            aten::copy_         3.97%     105.392us        69.06%       1.835ms     101.961us      57.664us        32.55%      60.545us       3.364us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.607us        22.92%      40.607us       3.384us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        13.94%      24.703us       2.059us            12  
+                                            aten::clone         0.89%      23.759us        66.19%       1.759ms     293.179us       0.000us         0.00%      19.938us       3.323us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.057us         9.63%      17.057us       2.843us             6  
+                                              aten::sub         1.37%      36.511us         2.33%      61.971us      10.329us      12.383us         6.99%      12.383us       2.064us             6  
+                                              aten::add         1.17%      31.070us         2.01%      53.400us       8.900us      12.320us         6.95%      12.320us       2.053us             6  
+                                Activity Buffer Request        53.91%       1.433ms        53.91%       1.433ms       1.433ms       2.881us         1.63%       2.881us       2.881us             1  
+                                    aten::empty_strided         1.17%      31.132us         1.17%      31.132us       5.189us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.85%     235.245us         8.85%     235.245us      39.208us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.64%      70.123us         3.36%      89.202us       3.717us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.72%      19.079us         0.72%      19.079us       0.795us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.35%     221.788us         8.35%     221.788us       4.621us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.460us         0.21%       5.460us       5.460us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.313ms
-Self CUDA time total: 177.119us
+Self CPU time total: 2.657ms
+Self CUDA time total: 177.148us
 
 
 
@@ -4804,27 +4804,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.076us       318.26%     945.076us     945.076us             1  
-                                            torch_eager        21.55%     289.808us        99.58%       1.339ms       1.339ms       0.000us         0.00%     314.171us     314.171us             1  
-                                              aten::mul        11.43%     153.633us        19.62%     263.817us      10.992us     145.952us        49.15%     145.952us       6.081us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.952us        49.15%     145.952us       6.081us            24  
-                                            aten::copy_         9.11%     122.489us        38.99%     524.297us      29.128us     110.173us        37.10%     127.389us       7.077us            18  
-                                            aten::clone         1.65%      22.169us        33.13%     445.468us      74.245us       0.000us         0.00%      70.110us      11.685us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.279us        19.29%      57.279us       4.773us            12  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.894us        17.81%      52.894us       8.816us             6  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.830us        13.75%      40.830us       3.402us            12  
-                                              aten::sub         2.94%      39.549us         4.81%      64.690us      10.782us      20.511us         6.91%      20.511us       3.418us             6  
-                                              aten::add         2.41%      32.411us         4.09%      55.020us       9.170us      20.319us         6.84%      20.319us       3.386us             6  
-                                Activity Buffer Request        11.32%     152.193us        11.32%     152.193us     152.193us      17.216us         5.80%      17.216us      17.216us             1  
-                                    aten::empty_strided         2.31%      31.082us         2.31%      31.082us       5.180us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.88%     186.593us        13.88%     186.593us      31.099us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.09%      68.450us         6.56%      88.160us       3.673us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.47%      19.710us         1.47%      19.710us       0.821us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.43%     220.956us        16.43%     220.956us       4.603us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       5.661us         0.42%       5.661us       5.661us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     954.138us       321.69%     954.138us     954.138us             1  
+                                            torch_eager        11.45%     309.471us        99.80%       2.697ms       2.697ms       0.000us         0.00%     313.854us     313.854us             1  
+                                              aten::mul         5.62%     151.933us         9.84%     265.955us      11.081us     144.896us        48.85%     144.896us       6.037us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     144.896us        48.85%     144.896us       6.037us            24  
+                                            aten::copy_         3.99%     107.722us        68.69%       1.856ms     103.120us     111.039us        37.44%     128.287us       7.127us            18  
+                                            aten::clone         1.05%      28.369us        65.82%       1.779ms     296.444us       0.000us         0.00%      70.944us      11.824us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.343us        19.33%      57.343us       4.779us            12  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.696us        18.10%      53.696us       8.949us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.671us        13.71%      40.671us       3.389us            12  
+                                              aten::sub         1.32%      35.620us         2.23%      60.211us      10.035us      20.448us         6.89%      20.448us       3.408us             6  
+                                              aten::add         1.16%      31.420us         1.99%      53.831us       8.972us      20.223us         6.82%      20.223us       3.371us             6  
+                                Activity Buffer Request        53.66%       1.450ms        53.66%       1.450ms       1.450ms      17.248us         5.82%      17.248us      17.248us             1  
+                                    aten::empty_strided         1.25%      33.832us         1.25%      33.832us       5.639us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.57%     231.556us         8.57%     231.556us      38.593us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.58%      69.773us         3.29%      88.953us       3.706us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.71%      19.180us         0.71%      19.180us       0.799us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.44%     228.015us         8.44%     228.015us       4.750us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.370us         0.20%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.345ms
-Self CUDA time total: 296.955us
+Self CPU time total: 2.702ms
+Self CUDA time total: 296.606us
 
 
 
@@ -4834,27 +4834,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     986.080us       556.73%     986.080us     986.080us             1  
-                                            torch_eager        12.52%     336.567us        99.81%       2.683ms       2.683ms       0.000us         0.00%     179.999us     179.999us             1  
-                                              aten::mul         5.82%     156.365us         9.99%     268.575us      11.191us      94.976us        53.62%      94.976us       3.957us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.976us        53.62%      94.976us       3.957us            24  
-                                            aten::copy_         3.98%     106.939us        67.04%       1.802ms     100.094us      57.535us        32.48%      60.415us       3.356us            18  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.703us        22.98%      40.703us       3.392us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.608us        13.89%      24.608us       2.051us            12  
-                                            aten::clone         1.08%      29.091us        64.22%       1.726ms     287.668us       0.000us         0.00%      19.712us       3.285us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.832us         9.50%      16.832us       2.805us             6  
-                                              aten::add         1.21%      32.499us         2.06%      55.240us       9.207us      12.320us         6.96%      12.320us       2.053us             6  
-                                              aten::sub         1.59%      42.650us         2.57%      69.041us      11.507us      12.288us         6.94%      12.288us       2.048us             6  
-                                Activity Buffer Request        53.52%       1.438ms        53.52%       1.438ms       1.438ms       2.880us         1.63%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.16%      31.221us         1.16%      31.221us       5.204us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         7.20%     193.473us         7.20%     193.473us      32.245us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.61%      70.195us         3.39%      91.232us       3.801us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.78%      21.037us         0.78%      21.037us       0.877us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.35%     224.324us         8.35%     224.324us       4.673us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       4.980us         0.19%       4.980us       4.980us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     930.130us       525.53%     930.130us     930.130us             1  
+                                            torch_eager        19.64%     282.826us        99.65%       1.435ms       1.435ms       0.000us         0.00%     179.836us     179.836us             1  
+                                              aten::mul        10.48%     150.844us        18.43%     265.387us      11.058us      94.845us        53.59%      94.845us       3.952us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.845us        53.59%      94.845us       3.952us            24  
+                                            aten::copy_         8.38%     120.684us        44.09%     634.887us      35.272us      57.502us        32.49%      60.350us       3.353us            18  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.478us        22.87%      40.478us       3.373us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        13.92%      24.641us       2.053us            12  
+                                            aten::clone         1.49%      21.461us        38.48%     554.053us      92.342us       0.000us         0.00%      19.872us       3.312us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.62%      17.024us       2.837us             6  
+                                              aten::sub         2.41%      34.731us         4.09%      58.881us       9.813us      12.353us         6.98%      12.353us       2.059us             6  
+                                              aten::add         2.13%      30.662us         3.72%      53.511us       8.919us      12.288us         6.94%      12.288us       2.048us             6  
+                                Activity Buffer Request        15.30%     220.275us        15.30%     220.275us     220.275us       2.848us         1.61%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.11%      30.450us         2.11%      30.450us       5.075us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.99%     230.296us        15.99%     230.296us      38.383us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.74%      68.240us         6.08%      87.483us       3.645us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.34%      19.243us         1.34%      19.243us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.64%     225.174us        15.64%     225.174us       4.691us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.110us         0.35%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.688ms
-Self CUDA time total: 177.119us
+Self CPU time total: 1.440ms
+Self CUDA time total: 176.988us
 
 
 
@@ -4864,27 +4864,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     955.007us       321.87%     955.007us     955.007us             1  
-                                            torch_eager        21.61%     290.382us        99.58%       1.338ms       1.338ms       0.000us         0.00%     314.050us     314.050us             1  
-                                              aten::mul        12.35%     165.965us        20.49%     275.388us      11.475us     146.274us        49.30%     146.274us       6.095us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.274us        49.30%     146.274us       6.095us            24  
-                                            aten::copy_         7.99%     107.375us        38.18%     513.111us      28.506us     109.984us        37.07%     127.328us       7.074us            18  
-                                            aten::clone         1.53%      20.570us        31.98%     429.868us      71.645us       0.000us         0.00%      70.048us      11.675us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.280us        19.31%      57.280us       4.773us            12  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.704us        17.76%      52.704us       8.784us             6  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.448us        13.63%      40.448us       3.371us            12  
-                                              aten::sub         2.99%      40.150us         4.79%      64.400us      10.733us      20.288us         6.84%      20.288us       3.381us             6  
-                                              aten::add         2.45%      32.907us         4.13%      55.499us       9.250us      20.160us         6.79%      20.160us       3.360us             6  
-                                Activity Buffer Request        11.77%     158.223us        11.77%     158.223us     158.223us      17.344us         5.85%      17.344us      17.344us             1  
-                                    aten::empty_strided         2.28%      30.711us         2.28%      30.711us       5.118us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.78%     185.224us        13.78%     185.224us      30.871us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.13%      68.942us         6.58%      88.372us       3.682us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.45%      19.430us         1.45%      19.430us       0.810us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.26%     218.554us        16.26%     218.554us       4.553us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.42%       5.611us         0.42%       5.611us       5.611us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     931.347us       313.60%     931.347us     931.347us             1  
+                                            torch_eager        20.13%     283.358us        99.65%       1.403ms       1.403ms       0.000us         0.00%     314.679us     314.679us             1  
+                                              aten::mul        10.72%     150.883us        18.79%     264.457us      11.019us     145.371us        48.95%     145.371us       6.057us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.371us        48.95%     145.371us       6.057us            24  
+                                            aten::copy_         7.40%     104.164us        42.97%     604.868us      33.604us     110.845us        37.32%     128.541us       7.141us            18  
+                                            aten::clone         1.53%      21.600us        37.15%     522.944us      87.157us       0.000us         0.00%      71.357us      11.893us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.184us        19.25%      57.184us       4.765us            12  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.661us        18.07%      53.661us       8.944us             6  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.767us        13.73%      40.767us       3.397us            12  
+                                              aten::add         2.28%      32.151us         3.88%      54.682us       9.114us      20.446us         6.88%      20.446us       3.408us             6  
+                                              aten::sub         2.39%      33.622us         4.06%      57.171us       9.528us      20.321us         6.84%      20.321us       3.387us             6  
+                                Activity Buffer Request        14.77%     207.975us        14.77%     207.975us     207.975us      17.696us         5.96%      17.696us      17.696us             1  
+                                    aten::empty_strided         2.15%      30.270us         2.15%      30.270us       5.045us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.22%     228.377us        16.22%     228.377us      38.063us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.75%      66.830us         6.13%      86.290us       3.595us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.38%      19.460us         1.38%      19.460us       0.811us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.91%     224.006us        15.91%     224.006us       4.667us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       4.971us         0.35%       4.971us       4.971us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.344ms
-Self CUDA time total: 296.706us
+Self CPU time total: 1.408ms
+Self CUDA time total: 296.983us
 
 
 
@@ -4894,27 +4894,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     962.939us       164.48%     962.939us     962.939us             1  
-                                            torch_eager        21.30%     292.019us        99.59%       1.365ms       1.365ms       0.000us         0.00%     609.117us     609.117us             1  
-                                            aten::copy_         7.59%     104.052us        39.10%     536.059us      29.781us     268.735us        45.90%     292.415us      16.245us            18  
-                                              aten::mul        11.61%     159.130us        19.77%     271.083us      11.295us     251.454us        42.95%     251.454us      10.477us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.454us        42.95%     251.454us      10.477us            24  
-                                            aten::clone         1.60%      21.919us        33.19%     455.067us      75.844us       0.000us         0.00%     201.504us      33.584us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.824us        30.37%     177.824us      29.637us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.911us        15.53%      90.911us       7.576us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.248us        11.15%      65.248us       5.437us            12  
-                                              aten::sub         2.98%      40.869us         4.94%      67.700us      11.283us      32.703us         5.59%      32.703us       5.451us             6  
-                                              aten::add         2.40%      32.850us         4.07%      55.841us       9.307us      32.545us         5.56%      32.545us       5.424us             6  
-                                Activity Buffer Request        13.18%     180.724us        13.18%     180.724us     180.724us      23.680us         4.04%      23.680us      23.680us             1  
-                                    aten::empty_strided         2.23%      30.541us         2.23%      30.541us       5.090us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        13.71%     188.023us        13.71%     188.023us      31.337us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         5.13%      70.322us         6.59%      90.292us       3.762us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.46%      19.970us         1.46%      19.970us       0.832us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.41%     225.035us        16.41%     225.035us       4.688us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.41%       5.640us         0.41%       5.640us       5.640us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     931.511us       159.85%     931.511us     931.511us             1  
+                                            torch_eager        19.89%     283.237us        99.62%       1.419ms       1.419ms       0.000us         0.00%     606.457us     606.457us             1  
+                                            aten::copy_         7.21%     102.593us        43.52%     619.697us      34.428us     267.708us        45.94%     291.419us      16.190us            18  
+                                              aten::mul        10.56%     150.425us        18.55%     264.165us      11.007us     249.406us        42.80%     249.406us      10.392us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     249.406us        42.80%     249.406us      10.392us            24  
+                                            aten::clone         1.52%      21.631us        38.04%     541.603us      90.267us       0.000us         0.00%     201.277us      33.546us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.566us        30.47%     177.566us      29.594us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.142us        15.47%      90.142us       7.512us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.632us        11.26%      65.632us       5.469us            12  
+                                              aten::add         2.16%      30.762us         3.77%      53.662us       8.944us      32.832us         5.63%      32.832us       5.472us             6  
+                                              aten::sub         2.53%      36.013us         4.23%      60.192us      10.032us      32.800us         5.63%      32.800us       5.467us             6  
+                                Activity Buffer Request        14.90%     212.145us        14.90%     212.145us     212.145us      23.711us         4.07%      23.711us      23.711us             1  
+                                    aten::empty_strided         2.14%      30.440us         2.14%      30.440us       5.073us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.99%     241.846us        16.99%     241.846us      40.308us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.71%      67.093us         6.00%      85.482us       3.562us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.29%      18.389us         1.29%      18.389us       0.766us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.73%     223.932us        15.73%     223.932us       4.665us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.360us         0.38%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.371ms
-Self CUDA time total: 585.437us
+Self CPU time total: 1.424ms
+Self CUDA time total: 582.746us
 
 
 
@@ -4924,55 +4924,61 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         9.18%     318.848us        77.56%       2.693ms       2.693ms       0.000us         0.00%       1.840ms       1.840ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.811ms       102.06%       1.811ms       1.811ms             1  
-                                            aten::copy_         3.19%     110.682us        53.02%       1.841ms     102.257us     792.737us        44.68%     858.369us      47.687us            18  
-                                              aten::mul         4.39%     152.554us         7.57%     262.845us      10.952us     833.316us        46.97%     833.316us      34.721us            24  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     833.316us        46.97%     833.316us      34.721us            24  
-                                            aten::clone         0.79%      27.538us        50.82%       1.764ms     294.050us       0.000us         0.00%     624.865us     104.144us             6  
-                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     559.233us        31.52%     559.233us      93.206us             6  
-void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     233.504us        13.16%     233.504us      19.459us            12  
-void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     148.032us         8.34%     148.032us      12.336us            12  
-                                              aten::sub         1.13%      39.132us         1.88%      65.111us      10.852us      90.112us         5.08%      90.112us      15.019us             6  
-                                Activity Buffer Request        41.37%       1.436ms        41.37%       1.436ms       1.436ms      65.632us         3.70%      65.632us      65.632us             1  
-                                              aten::add         0.97%      33.650us         1.61%      56.062us       9.344us      57.920us         3.26%      57.920us       9.653us             6  
-                                    aten::empty_strided         0.92%      31.941us         0.92%      31.941us       5.324us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         6.62%     229.834us         6.62%     229.834us      38.306us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.00%      69.363us         2.59%      89.831us       3.743us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.59%      20.468us         0.59%      20.468us       0.853us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         6.41%     222.613us         6.41%     222.613us       4.638us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        22.44%     778.913us        22.44%     778.913us     778.913us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager        13.84%     306.170us        64.60%       1.429ms       1.429ms       0.000us         0.00%       1.835ms       1.835ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.808ms       102.17%       1.808ms       1.808ms             1  
+                                            aten::copy_         5.17%     114.346us        26.90%     594.995us      33.055us     791.984us        44.77%     858.095us      47.672us            18  
+                                              aten::mul         6.78%     150.032us        12.17%     269.044us      11.210us     828.790us        46.85%     828.790us      34.533us            24  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     828.790us        46.85%     828.790us      34.533us            24  
+                                            aten::clone         1.04%      23.090us        22.74%     502.934us      83.822us       0.000us         0.00%     626.230us     104.372us             6  
+                         Memcpy DtoD (Device -&gt; Device)         0.00%       0.000us         0.00%       0.000us       0.000us     560.119us        31.66%     560.119us      93.353us             6  
+void at::native::elementwise_kernel&lt;128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     231.865us        13.11%     231.865us      19.322us            12  
+void at::native::vectorized_elementwise_kernel&lt;4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     148.413us         8.39%     148.413us      12.368us            12  
+                                              aten::sub         1.69%      37.309us         2.75%      60.900us      10.150us      90.142us         5.10%      90.142us      15.024us             6  
+                                Activity Buffer Request         8.38%     185.324us         8.38%     185.324us     185.324us      66.111us         3.74%      66.111us      66.111us             1  
+                                              aten::add         1.41%      31.181us         2.49%      55.022us       9.170us      58.271us         3.29%      58.271us       9.712us             6  
+                                    aten::empty_strided         1.45%      31.982us         1.45%      31.982us       5.330us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.29%     227.584us        10.29%     227.584us      37.931us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.11%      68.695us         3.96%      87.553us       3.648us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.85%      18.858us         0.85%      18.858us       0.786us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        10.59%     234.185us        10.59%     234.185us       4.879us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        35.40%     782.770us        35.40%     782.770us     782.770us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.472ms
-Self CUDA time total: 1.774ms
+Self CPU time total: 2.212ms
+Self CUDA time total: 1.769ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S128_H32_D64_R32     0.23  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.21  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
 torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.21  True
 torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.22  True
-torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.21  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
 torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.21  True
 torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
 torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
 torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
-torch_eager              cuda_B2_S2048_H8_D128_R64     0.23  True
+torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.23  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
 </pre></div>
+<div class="uv-install-logs" id="uv-logs-benchmark">
+<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
+<div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 238ms
+</div>
+</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/rotary.jsonl" class="artifact" target="_blank">rotary.jsonl</a>
diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg
index 7e204b4a84d9a5e16538227357f7ff28e8f5c02e..36f9217b1247fd55602a048202775fbf3d19cd24 100644
--- a/rotary/results/artifacts/combine/latency.svg
+++ b/rotary/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5f227e4c6029d72861d1a351c2c4353a8589dfeadaaf2aa034c7c28ec49a733
-size 37854
+oid sha256:1df6ff7a8f4a24eba95824695c07fcf25601f7f648a0a0773f7d1bc7119d9fd2
+size 37849
diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html
index 7f3eb837fca79886475bc80b225a1aa3afc35557..28fa630c24a01c9c2557497761cffaee1e2da610 100644
--- a/rotary/results/combined_results.html
+++ b/rotary/results/combined_results.html
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:49.568408</dc:date>
+    <dc:date>2025-10-31T20:14:10.200761</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -4451,109 +4451,109 @@ body[data-tool="eraser"] .main-content {
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 394.205114  L 823.142937 394.205114  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 395.136814  L 823.142937 395.136814  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_25">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="394.205114" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="395.136814" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_25">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="398.004332" transform="rotate(-0 40.72 398.004332)">0.1</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="398.936033" transform="rotate(-0 40.72 398.936033)">0.1</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 347.649754  L 823.142937 347.649754  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 348.364838  L 823.142937 348.364838  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_26">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="347.649754" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="348.364838" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_26">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.448973" transform="rotate(-0 40.72 351.448973)">0.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="352.164057" transform="rotate(-0 40.72 352.164057)">0.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 301.094395  L 823.142937 301.094395  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 301.592862  L 823.142937 301.592862  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_27">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="301.094395" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="301.592862" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_27">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.893614" transform="rotate(-0 40.72 304.893614)">0.3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="305.392081" transform="rotate(-0 40.72 305.392081)">0.3</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 254.539036  L 823.142937 254.539036  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 254.820886  L 823.142937 254.820886  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_28">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="254.539036" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="254.820886" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_28">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="258.338254" transform="rotate(-0 40.72 258.338254)">0.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="258.620105" transform="rotate(-0 40.72 258.620105)">0.4</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 207.983676  L 823.142937 207.983676  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 208.04891  L 823.142937 208.04891  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_29">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="207.983676" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="208.04891" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_29">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="211.782895" transform="rotate(-0 40.72 211.782895)">0.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="211.848129" transform="rotate(-0 40.72 211.848129)">0.5</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 47.72 161.428317  L 823.142937 161.428317  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 161.276934  L 823.142937 161.276934  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_30">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="161.428317" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="161.276934" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_30">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="165.227536" transform="rotate(-0 40.72 165.227536)">0.6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="165.076153" transform="rotate(-0 40.72 165.076153)">0.6</text>
      </g>
     </g>
     <g id="ytick_7">
      <g id="grid-y--8" class="grid grid-y">
-      <path d="M 47.72 114.872958  L 823.142937 114.872958  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 114.504958  L 823.142937 114.504958  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_31">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="114.872958" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="114.504958" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_31">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="118.672177" transform="rotate(-0 40.72 118.672177)">0.7</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="118.304177" transform="rotate(-0 40.72 118.304177)">0.7</text>
      </g>
     </g>
     <g id="ytick_8">
      <g id="grid-y--9" class="grid grid-y">
-      <path d="M 47.72 68.317598  L 823.142937 68.317598  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 67.732982  L 823.142937 67.732982  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_32">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="68.317598" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="67.732982" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_32">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="72.116817" transform="rotate(-0 40.72 72.116817)">0.8</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="71.532201" transform="rotate(-0 40.72 71.532201)">0.8</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -4561,67 +4561,67 @@ body[data-tool="eraser"] .main-content {
     </g>
    </g>
    <g id="series--hf-kernels-rotary" class="series">
-    <path d="M 82.966497 405.060892  L 113.615625 396.648339  L 144.264753 396.988193  L 174.913881 397.328047  L 205.563009 397.9193  L 236.212137 398.296399  L 266.861265 397.928611  L 297.510393 397.980288  L 328.159521 397.70049  L 358.808648 397.737734  L 389.457776 397.835501  L 420.106904 319.039158  L 450.756032 398.031033  L 481.40516 396.727483  L 512.054288 398.040344  L 542.703416 398.213065  L 573.352544 397.845277  L 604.001672 397.961666  L 634.6508 397.910455  L 665.299928 398.012411  L 695.949056 397.858778  L 726.598184 398.189321  L 757.247312 317.740264  L 787.896439 44.888614  " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 82.966497 405.060892  L 113.615625 398.507671  L 144.264753 398.999244  L 174.913881 399.546476  L 205.563009 398.13864  L 236.212137 398.232184  L 266.861265 397.806559  L 297.510393 398.166703  L 328.159521 398.321051  L 358.808648 396.96934  L 389.457776 397.08627  L 420.106904 319.115048  L 450.756032 397.886071  L 481.40516 397.67513  L 512.054288 397.914134  L 542.703416 397.867362  L 573.352544 397.867362  L 604.001672 397.703193  L 634.6508 397.479155  L 665.299928 397.87204  L 695.949056 398.353791  L 726.598184 397.825268  L 757.247312 317.698325  L 787.896439 44.888614  " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p088c925177)">
      <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="113.615625" y="396.648339" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="144.264753" y="396.988193" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="174.913881" y="397.328047" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="205.563009" y="397.9193" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="236.212137" y="398.296399" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="266.861265" y="397.928611" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="297.510393" y="397.980288" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="328.159521" y="397.70049" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="358.808648" y="397.737734" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="389.457776" y="397.835501" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="420.106904" y="319.039158" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="450.756032" y="398.031033" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="481.40516" y="396.727483" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="512.054288" y="398.040344" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="542.703416" y="398.213065" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="573.352544" y="397.845277" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="604.001672" y="397.961666" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="634.6508" y="397.910455" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="665.299928" y="398.012411" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="695.949056" y="397.858778" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="726.598184" y="398.189321" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="757.247312" y="317.740264" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="113.615625" y="398.507671" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="144.264753" y="398.999244" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="174.913881" y="399.546476" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="205.563009" y="398.13864" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="236.212137" y="398.232184" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="266.861265" y="397.806559" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="297.510393" y="398.166703" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="328.159521" y="398.321051" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="358.808648" y="396.96934" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="389.457776" y="397.08627" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="420.106904" y="319.115048" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="450.756032" y="397.886071" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="481.40516" y="397.67513" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="512.054288" y="397.914134" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="542.703416" y="397.867362" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="573.352544" y="397.867362" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="604.001672" y="397.703193" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="634.6508" y="397.479155" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="665.299928" y="397.87204" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="695.949056" y="398.353791" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="726.598184" y="397.825268" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="757.247312" y="317.698325" style="fill: #1f77b4; stroke: #1f77b4" />
      <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 82.966497 359.440365  L 113.615625 335.608676  L 144.264753 335.091912  L 174.913881 338.332165  L 205.563009 336.162685  L 236.212137 338.443898  L 266.861265 338.211121  L 297.510393 337.051892  L 328.159521 336.51185  L 358.808648 336.423395  L 389.457776 336.549094  L 420.106904 334.34237  L 450.756032 336.241829  L 481.40516 336.596115  L 512.054288 336.51185  L 542.703416 336.050952  L 573.352544 336.940625  L 604.001672 337.345191  L 634.6508 335.492288  L 665.299928 336.567717  L 695.949056 337.317258  L 726.598184 335.822831  L 757.247312 333.928493  L 787.896439 144.328068  " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 82.966497 361.575583  L 113.615625 339.878063  L 144.264753 340.574966  L 174.913881 342.240048  L 205.563009 341.482342  L 236.212137 341.968303  L 266.861265 341.748475  L 297.510393 341.164293  L 328.159521 341.014155  L 358.808648 341.968771  L 389.457776 341.585241  L 420.106904 337.094663  L 450.756032 339.171807  L 481.40516 341.757829  L 512.054288 341.051572  L 542.703416 341.930885  L 573.352544 341.660076  L 604.001672 342.081023  L 634.6508 340.82239  L 665.299928 339.19987  L 695.949056 339.512774  L 726.598184 340.046443  L 757.247312 334.732678  L 787.896439 143.851567  " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p088c925177)">
-     <use ns4:href="#m9b8c54d372" x="82.966497" y="359.440365" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="113.615625" y="335.608676" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="144.264753" y="335.091912" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="174.913881" y="338.332165" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="205.563009" y="336.162685" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="236.212137" y="338.443898" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="266.861265" y="338.211121" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="297.510393" y="337.051892" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="328.159521" y="336.51185" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="358.808648" y="336.423395" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="389.457776" y="336.549094" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="420.106904" y="334.34237" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="450.756032" y="336.241829" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="481.40516" y="336.596115" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="512.054288" y="336.51185" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="542.703416" y="336.050952" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="573.352544" y="336.940625" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="604.001672" y="337.345191" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="634.6508" y="335.492288" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="665.299928" y="336.567717" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="695.949056" y="337.317258" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="726.598184" y="335.822831" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="757.247312" y="333.928493" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="787.896439" y="144.328068" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="82.966497" y="361.575583" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="113.615625" y="339.878063" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="144.264753" y="340.574966" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="174.913881" y="342.240048" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="205.563009" y="341.482342" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="236.212137" y="341.968303" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="266.861265" y="341.748475" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="297.510393" y="341.164293" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="328.159521" y="341.014155" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="358.808648" y="341.968771" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="389.457776" y="341.585241" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="420.106904" y="337.094663" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="450.756032" y="339.171807" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="481.40516" y="341.757829" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="512.054288" y="341.051572" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="542.703416" y="341.930885" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="573.352544" y="341.660076" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="604.001672" y="342.081023" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="634.6508" y="340.82239" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="665.299928" y="339.19987" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="695.949056" y="339.512774" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="726.598184" y="340.046443" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="757.247312" y="334.732678" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="787.896439" y="143.851567" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">
@@ -4679,7 +4679,7 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> | 
-Cell: combine | 4.37s
+Cell: combine | 4.46s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4771,8 +4771,8 @@ hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  True
 hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.26  True
-hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  True
-hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  True
+hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.10  True
+hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.10  True
 hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  True
@@ -4783,37 +4783,37 @@ hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.85  True
-hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.26  True
+hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.27  True
 hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.09  True
 hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  True
 hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  True
-torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S128_H32_D64_R32     0.23  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.21  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
 torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.21  True
 torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.22  True
-torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.21  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
 torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.21  True
 torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
 torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
 torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
-torch_eager              cuda_B2_S2048_H8_D128_R64     0.23  True
+torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.23  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4833,7 +4833,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 193ms
+Installed 37 packages in 229ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4846,7 +4846,7 @@ Installed 37 packages in 193ms
   <rdf:RDF>
    <ns2:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-    <dc:date>2025-10-30T15:53:49.568408</dc:date>
+    <dc:date>2025-10-31T20:14:10.200761</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <ns2:Agent>
@@ -5190,109 +5190,109 @@ Installed 37 packages in 193ms
    <g id="matplotlib.axis_2">
     <g id="ytick_1">
      <g id="grid-y--2" class="grid grid-y">
-      <path d="M 47.72 394.205114  L 823.142937 394.205114  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 395.136814  L 823.142937 395.136814  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_25">
       <defs>
        <path id="m0fca2865ba" d="M 0 0  L -3.5 0  " style="stroke: #000000; stroke-width: 0.8" />
       </defs>
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="394.205114" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="395.136814" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_25">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="398.004332" transform="rotate(-0 40.72 398.004332)">0.1</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="398.936033" transform="rotate(-0 40.72 398.936033)">0.1</text>
      </g>
     </g>
     <g id="ytick_2">
      <g id="grid-y--3" class="grid grid-y">
-      <path d="M 47.72 347.649754  L 823.142937 347.649754  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 348.364838  L 823.142937 348.364838  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_26">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="347.649754" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="348.364838" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_26">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="351.448973" transform="rotate(-0 40.72 351.448973)">0.2</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="352.164057" transform="rotate(-0 40.72 352.164057)">0.2</text>
      </g>
     </g>
     <g id="ytick_3">
      <g id="grid-y--4" class="grid grid-y">
-      <path d="M 47.72 301.094395  L 823.142937 301.094395  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 301.592862  L 823.142937 301.592862  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_27">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="301.094395" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="301.592862" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_27">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="304.893614" transform="rotate(-0 40.72 304.893614)">0.3</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="305.392081" transform="rotate(-0 40.72 305.392081)">0.3</text>
      </g>
     </g>
     <g id="ytick_4">
      <g id="grid-y--5" class="grid grid-y">
-      <path d="M 47.72 254.539036  L 823.142937 254.539036  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 254.820886  L 823.142937 254.820886  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_28">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="254.539036" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="254.820886" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_28">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="258.338254" transform="rotate(-0 40.72 258.338254)">0.4</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="258.620105" transform="rotate(-0 40.72 258.620105)">0.4</text>
      </g>
     </g>
     <g id="ytick_5">
      <g id="grid-y--6" class="grid grid-y">
-      <path d="M 47.72 207.983676  L 823.142937 207.983676  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 208.04891  L 823.142937 208.04891  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_29">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="207.983676" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="208.04891" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_29">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="211.782895" transform="rotate(-0 40.72 211.782895)">0.5</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="211.848129" transform="rotate(-0 40.72 211.848129)">0.5</text>
      </g>
     </g>
     <g id="ytick_6">
      <g id="grid-y--7" class="grid grid-y">
-      <path d="M 47.72 161.428317  L 823.142937 161.428317  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 161.276934  L 823.142937 161.276934  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_30">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="161.428317" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="161.276934" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_30">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="165.227536" transform="rotate(-0 40.72 165.227536)">0.6</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="165.076153" transform="rotate(-0 40.72 165.076153)">0.6</text>
      </g>
     </g>
     <g id="ytick_7">
      <g id="grid-y--8" class="grid grid-y">
-      <path d="M 47.72 114.872958  L 823.142937 114.872958  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 114.504958  L 823.142937 114.504958  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_31">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="114.872958" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="114.504958" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_31">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="118.672177" transform="rotate(-0 40.72 118.672177)">0.7</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="118.304177" transform="rotate(-0 40.72 118.304177)">0.7</text>
      </g>
     </g>
     <g id="ytick_8">
      <g id="grid-y--9" class="grid grid-y">
-      <path d="M 47.72 68.317598  L 823.142937 68.317598  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
+      <path d="M 47.72 67.732982  L 823.142937 67.732982  " clip-path="url(#p088c925177)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
      </g>
      <g id="line2d_32">
       <g>
-       <use ns4:href="#m0fca2865ba" x="47.72" y="68.317598" style="stroke: #000000; stroke-width: 0.8" />
+       <use ns4:href="#m0fca2865ba" x="47.72" y="67.732982" style="stroke: #000000; stroke-width: 0.8" />
       </g>
      </g>
      <g id="text_32">
-      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="72.116817" transform="rotate(-0 40.72 72.116817)">0.8</text>
+      <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="71.532201" transform="rotate(-0 40.72 71.532201)">0.8</text>
      </g>
     </g>
     <g id="label--y" class="ylabel">
@@ -5300,67 +5300,67 @@ Installed 37 packages in 193ms
     </g>
    </g>
    <g id="series--hf-kernels-rotary" class="series">
-    <path d="M 82.966497 405.060892  L 113.615625 396.648339  L 144.264753 396.988193  L 174.913881 397.328047  L 205.563009 397.9193  L 236.212137 398.296399  L 266.861265 397.928611  L 297.510393 397.980288  L 328.159521 397.70049  L 358.808648 397.737734  L 389.457776 397.835501  L 420.106904 319.039158  L 450.756032 398.031033  L 481.40516 396.727483  L 512.054288 398.040344  L 542.703416 398.213065  L 573.352544 397.845277  L 604.001672 397.961666  L 634.6508 397.910455  L 665.299928 398.012411  L 695.949056 397.858778  L 726.598184 398.189321  L 757.247312 317.740264  L 787.896439 44.888614  " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 82.966497 405.060892  L 113.615625 398.507671  L 144.264753 398.999244  L 174.913881 399.546476  L 205.563009 398.13864  L 236.212137 398.232184  L 266.861265 397.806559  L 297.510393 398.166703  L 328.159521 398.321051  L 358.808648 396.96934  L 389.457776 397.08627  L 420.106904 319.115048  L 450.756032 397.886071  L 481.40516 397.67513  L 512.054288 397.914134  L 542.703416 397.867362  L 573.352544 397.867362  L 604.001672 397.703193  L 634.6508 397.479155  L 665.299928 397.87204  L 695.949056 398.353791  L 726.598184 397.825268  L 757.247312 317.698325  L 787.896439 44.888614  " clip-path="url(#p088c925177)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="md7efaf3aec" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #1f77b4" />
     </defs>
     <g clip-path="url(#p088c925177)">
      <use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="113.615625" y="396.648339" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="144.264753" y="396.988193" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="174.913881" y="397.328047" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="205.563009" y="397.9193" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="236.212137" y="398.296399" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="266.861265" y="397.928611" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="297.510393" y="397.980288" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="328.159521" y="397.70049" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="358.808648" y="397.737734" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="389.457776" y="397.835501" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="420.106904" y="319.039158" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="450.756032" y="398.031033" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="481.40516" y="396.727483" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="512.054288" y="398.040344" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="542.703416" y="398.213065" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="573.352544" y="397.845277" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="604.001672" y="397.961666" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="634.6508" y="397.910455" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="665.299928" y="398.012411" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="695.949056" y="397.858778" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="726.598184" y="398.189321" style="fill: #1f77b4; stroke: #1f77b4" />
-     <use ns4:href="#md7efaf3aec" x="757.247312" y="317.740264" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="113.615625" y="398.507671" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="144.264753" y="398.999244" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="174.913881" y="399.546476" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="205.563009" y="398.13864" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="236.212137" y="398.232184" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="266.861265" y="397.806559" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="297.510393" y="398.166703" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="328.159521" y="398.321051" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="358.808648" y="396.96934" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="389.457776" y="397.08627" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="420.106904" y="319.115048" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="450.756032" y="397.886071" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="481.40516" y="397.67513" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="512.054288" y="397.914134" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="542.703416" y="397.867362" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="573.352544" y="397.867362" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="604.001672" y="397.703193" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="634.6508" y="397.479155" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="665.299928" y="397.87204" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="695.949056" y="398.353791" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="726.598184" y="397.825268" style="fill: #1f77b4; stroke: #1f77b4" />
+     <use ns4:href="#md7efaf3aec" x="757.247312" y="317.698325" style="fill: #1f77b4; stroke: #1f77b4" />
      <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
     </g>
    </g>
    <g id="series--torch-eager" class="series">
-    <path d="M 82.966497 359.440365  L 113.615625 335.608676  L 144.264753 335.091912  L 174.913881 338.332165  L 205.563009 336.162685  L 236.212137 338.443898  L 266.861265 338.211121  L 297.510393 337.051892  L 328.159521 336.51185  L 358.808648 336.423395  L 389.457776 336.549094  L 420.106904 334.34237  L 450.756032 336.241829  L 481.40516 336.596115  L 512.054288 336.51185  L 542.703416 336.050952  L 573.352544 336.940625  L 604.001672 337.345191  L 634.6508 335.492288  L 665.299928 336.567717  L 695.949056 337.317258  L 726.598184 335.822831  L 757.247312 333.928493  L 787.896439 144.328068  " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+    <path d="M 82.966497 361.575583  L 113.615625 339.878063  L 144.264753 340.574966  L 174.913881 342.240048  L 205.563009 341.482342  L 236.212137 341.968303  L 266.861265 341.748475  L 297.510393 341.164293  L 328.159521 341.014155  L 358.808648 341.968771  L 389.457776 341.585241  L 420.106904 337.094663  L 450.756032 339.171807  L 481.40516 341.757829  L 512.054288 341.051572  L 542.703416 341.930885  L 573.352544 341.660076  L 604.001672 342.081023  L 634.6508 340.82239  L 665.299928 339.19987  L 695.949056 339.512774  L 726.598184 340.046443  L 757.247312 334.732678  L 787.896439 143.851567  " clip-path="url(#p088c925177)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
     <defs>
      <path id="m9b8c54d372" d="M 0 3  C 0.795609 3 1.55874 2.683901 2.12132 2.12132  C 2.683901 1.55874 3 0.795609 3 0  C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132  C 1.55874 -2.683901 0.795609 -3 0 -3  C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132  C -2.683901 -1.55874 -3 -0.795609 -3 0  C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132  C -1.55874 2.683901 -0.795609 3 0 3  z " style="stroke: #ff7f0e" />
     </defs>
     <g clip-path="url(#p088c925177)">
-     <use ns4:href="#m9b8c54d372" x="82.966497" y="359.440365" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="113.615625" y="335.608676" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="144.264753" y="335.091912" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="174.913881" y="338.332165" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="205.563009" y="336.162685" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="236.212137" y="338.443898" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="266.861265" y="338.211121" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="297.510393" y="337.051892" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="328.159521" y="336.51185" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="358.808648" y="336.423395" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="389.457776" y="336.549094" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="420.106904" y="334.34237" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="450.756032" y="336.241829" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="481.40516" y="336.596115" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="512.054288" y="336.51185" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="542.703416" y="336.050952" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="573.352544" y="336.940625" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="604.001672" y="337.345191" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="634.6508" y="335.492288" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="665.299928" y="336.567717" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="695.949056" y="337.317258" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="726.598184" y="335.822831" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="757.247312" y="333.928493" style="fill: #ff7f0e; stroke: #ff7f0e" />
-     <use ns4:href="#m9b8c54d372" x="787.896439" y="144.328068" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="82.966497" y="361.575583" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="113.615625" y="339.878063" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="144.264753" y="340.574966" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="174.913881" y="342.240048" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="205.563009" y="341.482342" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="236.212137" y="341.968303" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="266.861265" y="341.748475" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="297.510393" y="341.164293" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="328.159521" y="341.014155" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="358.808648" y="341.968771" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="389.457776" y="341.585241" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="420.106904" y="337.094663" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="450.756032" y="339.171807" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="481.40516" y="341.757829" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="512.054288" y="341.051572" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="542.703416" y="341.930885" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="573.352544" y="341.660076" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="604.001672" y="342.081023" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="634.6508" y="340.82239" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="665.299928" y="339.19987" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="695.949056" y="339.512774" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="726.598184" y="340.046443" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="757.247312" y="334.732678" style="fill: #ff7f0e; stroke: #ff7f0e" />
+     <use ns4:href="#m9b8c54d372" x="787.896439" y="143.851567" style="fill: #ff7f0e; stroke: #ff7f0e" />
     </g>
    </g>
    <g id="patch_3">