drbh HF Staff commited on
Commit
970ee75
·
verified ·
1 Parent(s): 80669f0

Upload folder using huggingface_hub

Browse files
flash_attn/impls/artifacts/benchmark/attn.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4474239945411682, "p50": 0.44921600818634033, "p90": 0.45241600275039673, "mean": 0.45066879987716674, "reps": 5, "warmup": 2}, "compile_ms": 1.7530560493469238, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4652479887008667, "p50": 0.4705919921398163, "p90": 0.4716799855232239, "mean": 0.47004159688949587, "reps": 5, "warmup": 2}, "compile_ms": 0.36032000184059143, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5987840294837952, "p50": 0.6021760106086731, "p90": 0.6045759916305542, "mean": 0.6022783994674683, "reps": 5, "warmup": 2}, "compile_ms": 0.4950079917907715, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6000319719314575, "p50": 0.600383996963501, "p90": 0.6016640067100525, "mean": 0.6013055920600892, "reps": 5, "warmup": 2}, "compile_ms": 0.49647998809814453, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.639136016368866, "p50": 0.6404479742050171, "p90": 0.6416320204734802, "mean": 0.6408192038536071, "reps": 5, "warmup": 2}, "compile_ms": 0.530239999294281, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6447359919548035, "p50": 0.6462399959564209, "p90": 0.6483839750289917, "mean": 0.6466111898422241, "reps": 5, "warmup": 2}, "compile_ms": 0.5342720150947571, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.3603839874267578, "p50": 0.361952006816864, "p90": 0.3624640107154846, "mean": 0.3619711995124817, "reps": 5, "warmup": 2}, "compile_ms": 1.5701119899749756, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.3892799913883209, "p50": 0.3909760117530823, "p90": 0.3922559916973114, "mean": 0.3912447988986969, "reps": 5, "warmup": 2}, "compile_ms": 0.35811200737953186, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5240640044212341, "p50": 0.5248960256576538, "p90": 0.5248960256576538, "mean": 0.5258048176765442, "reps": 5, "warmup": 2}, "compile_ms": 0.4891839921474457, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5265600085258484, "p50": 0.5277760028839111, "p90": 0.5282559990882874, "mean": 0.5276032090187073, "reps": 5, "warmup": 2}, "compile_ms": 0.4968000054359436, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5639039874076843, "p50": 0.5657920241355896, "p90": 0.5668479800224304, "mean": 0.5656383991241455, "reps": 5, "warmup": 2}, "compile_ms": 0.5312319993972778, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5689600110054016, "p50": 0.5698239803314209, "p90": 0.5713919997215271, "mean": 0.5789952039718628, "reps": 5, "warmup": 2}, "compile_ms": 0.5350080132484436, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T18:08:46Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5194560289382935, "p50": 0.5272960066795349, "p90": 0.5312960147857666, "mean": 0.527347207069397, "reps": 5, "warmup": 2}, "compile_ms": 3354.235107421875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T18:08:47Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5560640096664429, "p50": 0.5571519732475281, "p90": 0.5611839890480042, "mean": 0.5586367964744567, "reps": 5, "warmup": 2}, "compile_ms": 471.23529052734375, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T18:08:47Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6841920018196106, "p50": 0.6860160231590271, "p90": 0.6869760155677795, "mean": 0.6860736012458801, "reps": 5, "warmup": 2}, "compile_ms": 468.1533508300781, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T18:08:48Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7152000069618225, "p50": 0.7161920070648193, "p90": 0.7164160013198853, "mean": 0.7167360067367554, "reps": 5, "warmup": 2}, "compile_ms": 465.7891540527344, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T18:08:48Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7400959730148315, "p50": 0.742143988609314, "p90": 0.7431039810180664, "mean": 0.7423295855522156, "reps": 5, "warmup": 2}, "compile_ms": 468.6272888183594, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T18:08:49Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7718080282211304, "p50": 0.7745919823646545, "p90": 0.7748159766197205, "mean": 0.7743871927261352, "reps": 5, "warmup": 2}, "compile_ms": 475.9334716796875, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T19:58:18Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5141760110855103, "p50": 0.5175679922103882, "p90": 0.5197759866714478, "mean": 0.5181439876556396, "reps": 5, "warmup": 2}, "compile_ms": 3084.621826171875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5549119710922241, "p50": 0.5582720041275024, "p90": 0.5598080158233643, "mean": 0.5579584002494812, "reps": 5, "warmup": 2}, "compile_ms": 270.21795654296875, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6853119730949402, "p50": 0.687391996383667, "p90": 0.6883519887924194, "mean": 0.6872959971427918, "reps": 5, "warmup": 2}, "compile_ms": 269.78741455078125, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7128639817237854, "p50": 0.7160959839820862, "p90": 0.7167680263519287, "mean": 0.716153597831726, "reps": 5, "warmup": 2}, "compile_ms": 269.8607177734375, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7386879920959473, "p50": 0.7400959730148315, "p90": 0.7415040135383606, "mean": 0.7418303966522217, "reps": 5, "warmup": 2}, "compile_ms": 269.20501708984375, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T19:58:20Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7708160281181335, "p50": 0.7740799784660339, "p90": 0.7753919959068298, "mean": 0.7745471954345703, "reps": 5, "warmup": 2}, "compile_ms": 270.93829345703125, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T18:09:34Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6148160099983215, "p50": 0.6296960115432739, "p90": 0.6522240042686462, "mean": 0.6489088058471679, "reps": 5, "warmup": 2}, "compile_ms": 4649.109375, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T18:09:35Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6615359783172607, "p50": 0.6821119785308838, "p90": 0.7128959894180298, "mean": 0.700761592388153, "reps": 5, "warmup": 2}, "compile_ms": 1487.6849365234375, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T18:09:37Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7967039942741394, "p50": 0.8164799809455872, "p90": 0.8463680148124695, "mean": 0.834444797039032, "reps": 5, "warmup": 2}, "compile_ms": 1492.66748046875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T18:09:39Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8432319760322571, "p50": 0.8498560190200806, "p90": 0.8750079870223999, "mean": 0.8709375977516174, "reps": 5, "warmup": 2}, "compile_ms": 1477.6558837890625, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T18:09:41Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8775359988212585, "p50": 0.9030719995498657, "p90": 0.903872013092041, "mean": 0.9069631934165955, "reps": 5, "warmup": 2}, "compile_ms": 1919.1016845703125, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T18:09:43Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9145920276641846, "p50": 0.9164159893989563, "p90": 0.9357439875602722, "mean": 0.9371584057807922, "reps": 5, "warmup": 2}, "compile_ms": 1487.1219482421875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T19:57:25Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6144000291824341, "p50": 0.6245759725570679, "p90": 0.6483200192451477, "mean": 0.6468096017837525, "reps": 5, "warmup": 2}, "compile_ms": 4407.3388671875, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T19:57:27Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6689280271530151, "p50": 0.6851199865341187, "p90": 0.7184960246086121, "mean": 0.7060160160064697, "reps": 5, "warmup": 2}, "compile_ms": 1686.2735595703125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T19:57:29Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7953600287437439, "p50": 0.8155840039253235, "p90": 0.8403519988059998, "mean": 0.8332608103752136, "reps": 5, "warmup": 2}, "compile_ms": 1462.938232421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T19:57:31Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8470720052719116, "p50": 0.849727988243103, "p90": 0.8745279908180237, "mean": 0.8719295978546142, "reps": 5, "warmup": 2}, "compile_ms": 1689.3455810546875, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T19:57:33Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8677120208740234, "p50": 0.8835520148277283, "p90": 0.9034240245819092, "mean": 0.9034304022789001, "reps": 5, "warmup": 2}, "compile_ms": 1693.035888671875, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T19:57:34Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9154239892959595, "p50": 0.9213759899139404, "p90": 0.9359679818153381, "mean": 0.9387519836425782, "reps": 5, "warmup": 2}, "compile_ms": 1689.36279296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  # "numpy",
5
  # "torch",
6
  # "kernels-benchmark-tools",
7
- # "xformers",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -14,32 +14,35 @@ import torch
14
  import sys
15
  import os
16
  import kernels_benchmark_tools as kbt
17
- import xformers.ops as xops
18
 
 
19
 
20
- def xformers_attention(q, k, v):
21
- """xFormers memory efficient attention"""
22
- # xFormers expects [batch, seq_len, heads, head_dim]
23
- return xops.memory_efficient_attention(q, k, v)
24
 
25
 
26
  kbt.add(
27
- "xformers_meff",
28
- xformers_attention,
29
- tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
30
  )
31
 
32
  if __name__ == "__main__":
33
  device = "cuda" if torch.cuda.is_available() else "cpu"
34
- dtype = "float32" if device == "cpu" else "bfloat16"
 
 
 
 
 
35
 
36
  # Flux-like workloads
37
- base = 1024 if device == "cuda" else 512
38
- flux_sizes = (
39
- [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
40
- )
41
- heads = 24 if device == "cuda" else 8
42
- head_dim = 128 if device == "cuda" else 64
43
 
44
  wl = []
45
  for L in flux_sizes:
 
4
  # "numpy",
5
  # "torch",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
14
  import sys
15
  import os
16
  import kernels_benchmark_tools as kbt
17
+ from kernels import get_kernel
18
 
19
+ hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
20
 
21
+
22
+ def hf_flash_attention3(query, key, value):
23
+ return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
 
24
 
25
 
26
  kbt.add(
27
+ "hf_kernels_flash_attn3",
28
+ hf_flash_attention3,
29
+ tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
30
  )
31
 
32
  if __name__ == "__main__":
33
  device = "cuda" if torch.cuda.is_available() else "cpu"
34
+
35
+ if device == "cpu":
36
+ print("HF Kernels Flash Attention 3 requires CUDA - skipping benchmark")
37
+ sys.exit(0)
38
+
39
+ dtype = "bfloat16"
40
 
41
  # Flux-like workloads
42
+ base = 1024
43
+ flux_sizes = [128, 256, 320, 384, 448, 512]
44
+ heads = 24
45
+ head_dim = 128
 
 
46
 
47
  wl = []
48
  for L in flux_sizes:
flash_attn/impls/compiled_variants.html CHANGED
@@ -101,10 +101,12 @@
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
- :root[data-ui="monocolor"] .reset-toggle { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
105
  :root[data-ui="monocolor"] .menu-button:hover,
106
  :root[data-ui="monocolor"] .theme-toggle:hover,
107
- :root[data-ui="monocolor"] .reset-toggle:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
108
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
109
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
@@ -214,7 +216,8 @@
214
  /* Keep default control styling when widgets are enabled, even in minimal UI */
215
  :root[data-ui="none"][data-widgets="on"] .menu-button,
216
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
217
- :root[data-ui="none"][data-widgets="on"] .reset-toggle {
 
218
  background: #f6f6f6;
219
  border: 1px solid #cccccc;
220
  color: #222222;
@@ -244,7 +247,8 @@
244
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
245
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
246
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
247
- :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover {
 
248
  background: #ededed;
249
  border-color: #bbbbbb;
250
  color: #000000;
@@ -302,7 +306,8 @@
302
  }
303
 
304
  .theme-toggle,
305
- .reset-toggle {
 
306
  background: var(--bg-secondary);
307
  border: 1px solid var(--border-primary);
308
  padding: 8px 12px;
@@ -313,9 +318,15 @@
313
  font-size: 0.9rem;
314
  user-select: none;
315
  }
316
-
 
 
 
 
 
317
  .theme-toggle:hover,
318
- .reset-toggle:hover {
 
319
  color: var(--text-primary);
320
  background: var(--bg-tertiary);
321
  }
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3697
  <body>
3698
  <div class="controls">
3699
  <div class="controls-buttons">
 
 
 
3700
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3701
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3702
  <div class="menu-button" onclick="toggleMenu()">
@@ -3746,7 +3760,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3746
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
3747
  <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
3748
  </span> |
3749
- Cell: benchmark_default | 46.78s
3750
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
3751
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
3752
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
@@ -3830,7 +3844,7 @@ Cell: benchmark_default | 46.78s
3830
  </div>
3831
  <div id="output-benchmark_default" class="cell-output">
3832
  <div class="cell-stdout">impl wl p50(ms) ok
3833
- torch_flash_compiled_default flux_L128 0.53 True
3834
  torch_flash_compiled_default flux_L256 0.56 True
3835
  torch_flash_compiled_default flux_L320 0.69 True
3836
  torch_flash_compiled_default flux_L384 0.72 True
@@ -3841,41 +3855,41 @@ torch_flash_compiled_default flux_L512 0.77 True
3841
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3842
  <div class="uv-logs-content" style="display: none;">
3843
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3844
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3845
- Downloading matplotlib (8.3MiB)
3846
  Downloading networkx (1.9MiB)
3847
  Downloading setuptools (1.1MiB)
3848
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3849
- Downloading pillow (6.3MiB)
 
3850
  Downloading sympy (6.0MiB)
3851
- Downloading nvidia-cublas-cu12 (566.8MiB)
3852
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3853
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3854
  Downloading nvidia-nccl-cu12 (307.4MiB)
3855
- Downloading nvidia-cufft-cu12 (184.2MiB)
3856
- Downloading nvidia-curand-cu12 (60.7MiB)
3857
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
 
3858
  Downloading nvidia-cufile-cu12 (1.1MiB)
3859
- Downloading torch (846.9MiB)
3860
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3861
- Downloading fonttools (4.7MiB)
3862
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3863
- Downloading kiwisolver (1.4MiB)
3864
  Downloading triton (148.3MiB)
3865
- Downloading numpy (16.2MiB)
 
 
 
3866
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3867
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3868
  Downloading nvidia-cufile-cu12
3869
  Downloading kiwisolver
3870
  Downloading setuptools
3871
- Downloading networkx
3872
  Downloading fonttools
 
3873
  Downloading pillow
3874
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3875
  Downloading nvidia-cuda-cupti-cu12
3876
  Downloading matplotlib
3877
- Downloading sympy
3878
  Downloading numpy
 
3879
  Downloading nvidia-nvjitlink-cu12
3880
  Downloading nvidia-curand-cu12
3881
  Downloading nvidia-cuda-nvrtc-cu12
@@ -3888,7 +3902,7 @@ Downloading numpy (16.2MiB)
3888
  Downloading nvidia-cublas-cu12
3889
  Downloading nvidia-cudnn-cu12
3890
  Downloading torch
3891
- Installed 37 packages in 557ms
3892
  </div>
3893
  </div>
3894
  <div class="cell-artifacts">
@@ -3906,7 +3920,7 @@ Installed 37 packages in 557ms
3906
  <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
3907
  <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
3908
  </span> |
3909
- Cell: benchmark_max_autotune | 53.65s
3910
  | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
3911
  <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
3912
  <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
@@ -3990,39 +4004,39 @@ Cell: benchmark_max_autotune | 53.65s
3990
  </div>
3991
  <div id="output-benchmark_max_autotune" class="cell-output">
3992
  <div class="cell-stdout">impl wl p50(ms) ok
3993
- torch_flash_compiled_max_autotune flux_L128 0.63 True
3994
- torch_flash_compiled_max_autotune flux_L256 0.68 True
3995
  torch_flash_compiled_max_autotune flux_L320 0.82 True
3996
  torch_flash_compiled_max_autotune flux_L384 0.85 True
3997
- torch_flash_compiled_max_autotune flux_L448 0.90 True
3998
  torch_flash_compiled_max_autotune flux_L512 0.92 True
3999
  </div>
4000
  <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
4001
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4002
  <div class="uv-logs-content" style="display: none;">
4003
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
4004
- Downloading nvidia-cufile-cu12 (1.1MiB)
4005
- Downloading nvidia-cublas-cu12 (566.8MiB)
4006
- Downloading sympy (6.0MiB)
4007
- Downloading nvidia-nccl-cu12 (307.4MiB)
4008
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4009
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4010
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4011
  Downloading matplotlib (8.3MiB)
 
 
 
4012
  Downloading triton (148.3MiB)
4013
- Downloading networkx (1.9MiB)
4014
- Downloading fonttools (4.7MiB)
4015
  Downloading torch (846.9MiB)
4016
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4017
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4018
- Downloading kiwisolver (1.4MiB)
4019
  Downloading nvidia-cufft-cu12 (184.2MiB)
4020
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4021
  Downloading numpy (16.2MiB)
 
 
 
 
4022
  Downloading pillow (6.3MiB)
4023
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4024
  Downloading setuptools (1.1MiB)
4025
- Downloading nvidia-curand-cu12 (60.7MiB)
4026
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
4027
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
4028
  Downloading nvidia-cufile-cu12
@@ -4042,13 +4056,13 @@ Downloading nvidia-curand-cu12 (60.7MiB)
4042
  Downloading triton
4043
  Downloading nvidia-cufft-cu12
4044
  Downloading nvidia-cusolver-cu12
4045
- Downloading nvidia-cusparselt-cu12
4046
  Downloading nvidia-cusparse-cu12
 
4047
  Downloading nvidia-nccl-cu12
4048
  Downloading nvidia-cublas-cu12
4049
  Downloading nvidia-cudnn-cu12
4050
  Downloading torch
4051
- Installed 37 packages in 525ms
4052
  </div>
4053
  </div>
4054
  <div class="cell-artifacts">
 
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
+ :root[data-ui="monocolor"] .reset-toggle,
105
+ :root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
106
  :root[data-ui="monocolor"] .menu-button:hover,
107
  :root[data-ui="monocolor"] .theme-toggle:hover,
108
+ :root[data-ui="monocolor"] .reset-toggle:hover,
109
+ :root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
111
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
112
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
 
216
  /* Keep default control styling when widgets are enabled, even in minimal UI */
217
  :root[data-ui="none"][data-widgets="on"] .menu-button,
218
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
219
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle,
220
+ :root[data-ui="none"][data-widgets="on"] .back-button {
221
  background: #f6f6f6;
222
  border: 1px solid #cccccc;
223
  color: #222222;
 
247
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
248
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
249
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
250
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
251
+ :root[data-ui="none"][data-widgets="on"] .back-button:hover {
252
  background: #ededed;
253
  border-color: #bbbbbb;
254
  color: #000000;
 
306
  }
307
 
308
  .theme-toggle,
309
+ .reset-toggle,
310
+ .back-button {
311
  background: var(--bg-secondary);
312
  border: 1px solid var(--border-primary);
313
  padding: 8px 12px;
 
318
  font-size: 0.9rem;
319
  user-select: none;
320
  }
321
+
322
+ .back-button {
323
+ text-decoration: none;
324
+ display: inline-block;
325
+ }
326
+
327
  .theme-toggle:hover,
328
+ .reset-toggle:hover,
329
+ .back-button:hover {
330
  color: var(--text-primary);
331
  background: var(--bg-tertiary);
332
  }
 
3708
  <body>
3709
  <div class="controls">
3710
  <div class="controls-buttons">
3711
+
3712
+ <a href="index.html" class="back-button">← back</a>
3713
+
3714
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3715
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3716
  <div class="menu-button" onclick="toggleMenu()">
 
3760
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
3761
  <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
3762
  </span> |
3763
+ Cell: benchmark_default | 45.23s
3764
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
3765
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
3766
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
 
3844
  </div>
3845
  <div id="output-benchmark_default" class="cell-output">
3846
  <div class="cell-stdout">impl wl p50(ms) ok
3847
+ torch_flash_compiled_default flux_L128 0.52 True
3848
  torch_flash_compiled_default flux_L256 0.56 True
3849
  torch_flash_compiled_default flux_L320 0.69 True
3850
  torch_flash_compiled_default flux_L384 0.72 True
 
3855
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3856
  <div class="uv-logs-content" style="display: none;">
3857
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3858
+ Downloading pillow (6.3MiB)
3859
+ Downloading numpy (16.2MiB)
3860
  Downloading networkx (1.9MiB)
3861
  Downloading setuptools (1.1MiB)
3862
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3863
+ Downloading fonttools (4.7MiB)
3864
+ Downloading kiwisolver (1.4MiB)
3865
  Downloading sympy (6.0MiB)
3866
+ Downloading torch (846.9MiB)
 
 
3867
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
3868
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3869
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3870
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3871
+ Downloading matplotlib (8.3MiB)
3872
  Downloading nvidia-cufile-cu12 (1.1MiB)
3873
+ Downloading nvidia-curand-cu12 (60.7MiB)
3874
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
 
3875
  Downloading triton (148.3MiB)
3876
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3877
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3878
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3879
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3880
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3881
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3882
  Downloading nvidia-cufile-cu12
3883
  Downloading kiwisolver
3884
  Downloading setuptools
 
3885
  Downloading fonttools
3886
+ Downloading networkx
3887
  Downloading pillow
3888
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3889
  Downloading nvidia-cuda-cupti-cu12
3890
  Downloading matplotlib
 
3891
  Downloading numpy
3892
+ Downloading sympy
3893
  Downloading nvidia-nvjitlink-cu12
3894
  Downloading nvidia-curand-cu12
3895
  Downloading nvidia-cuda-nvrtc-cu12
 
3902
  Downloading nvidia-cublas-cu12
3903
  Downloading nvidia-cudnn-cu12
3904
  Downloading torch
3905
+ Installed 37 packages in 551ms
3906
  </div>
3907
  </div>
3908
  <div class="cell-artifacts">
 
3920
  <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark_max_autotune | 54.06s
3924
  | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
3926
  <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
 
4004
  </div>
4005
  <div id="output-benchmark_max_autotune" class="cell-output">
4006
  <div class="cell-stdout">impl wl p50(ms) ok
4007
+ torch_flash_compiled_max_autotune flux_L128 0.62 True
4008
+ torch_flash_compiled_max_autotune flux_L256 0.69 True
4009
  torch_flash_compiled_max_autotune flux_L320 0.82 True
4010
  torch_flash_compiled_max_autotune flux_L384 0.85 True
4011
+ torch_flash_compiled_max_autotune flux_L448 0.88 True
4012
  torch_flash_compiled_max_autotune flux_L512 0.92 True
4013
  </div>
4014
  <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
4015
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4016
  <div class="uv-logs-content" style="display: none;">
4017
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
4018
+ Downloading nvidia-curand-cu12 (60.7MiB)
 
 
 
4019
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4020
+ Downloading networkx (1.9MiB)
 
4021
  Downloading matplotlib (8.3MiB)
4022
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4023
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4024
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4025
  Downloading triton (148.3MiB)
4026
+ Downloading nvidia-nccl-cu12 (307.4MiB)
 
4027
  Downloading torch (846.9MiB)
4028
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4029
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
 
4030
  Downloading nvidia-cufft-cu12 (184.2MiB)
4031
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4032
  Downloading numpy (16.2MiB)
4033
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4034
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4035
+ Downloading sympy (6.0MiB)
4036
+ Downloading kiwisolver (1.4MiB)
4037
  Downloading pillow (6.3MiB)
4038
+ Downloading fonttools (4.7MiB)
4039
  Downloading setuptools (1.1MiB)
 
4040
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
4041
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
4042
  Downloading nvidia-cufile-cu12
 
4056
  Downloading triton
4057
  Downloading nvidia-cufft-cu12
4058
  Downloading nvidia-cusolver-cu12
 
4059
  Downloading nvidia-cusparse-cu12
4060
+ Downloading nvidia-cusparselt-cu12
4061
  Downloading nvidia-nccl-cu12
4062
  Downloading nvidia-cublas-cu12
4063
  Downloading nvidia-cudnn-cu12
4064
  Downloading torch
4065
+ Installed 37 packages in 513ms
4066
  </div>
4067
  </div>
4068
  <div class="cell-artifacts">
flash_attn/impls/flash_attention.html CHANGED
@@ -101,10 +101,12 @@
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
- :root[data-ui="monocolor"] .reset-toggle { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
105
  :root[data-ui="monocolor"] .menu-button:hover,
106
  :root[data-ui="monocolor"] .theme-toggle:hover,
107
- :root[data-ui="monocolor"] .reset-toggle:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
108
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
109
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
@@ -214,7 +216,8 @@
214
  /* Keep default control styling when widgets are enabled, even in minimal UI */
215
  :root[data-ui="none"][data-widgets="on"] .menu-button,
216
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
217
- :root[data-ui="none"][data-widgets="on"] .reset-toggle {
 
218
  background: #f6f6f6;
219
  border: 1px solid #cccccc;
220
  color: #222222;
@@ -244,7 +247,8 @@
244
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
245
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
246
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
247
- :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover {
 
248
  background: #ededed;
249
  border-color: #bbbbbb;
250
  color: #000000;
@@ -302,7 +306,8 @@
302
  }
303
 
304
  .theme-toggle,
305
- .reset-toggle {
 
306
  background: var(--bg-secondary);
307
  border: 1px solid var(--border-primary);
308
  padding: 8px 12px;
@@ -313,9 +318,15 @@
313
  font-size: 0.9rem;
314
  user-select: none;
315
  }
316
-
 
 
 
 
 
317
  .theme-toggle:hover,
318
- .reset-toggle:hover {
 
319
  color: var(--text-primary);
320
  background: var(--bg-tertiary);
321
  }
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3697
  <body>
3698
  <div class="controls">
3699
  <div class="controls-buttons">
 
 
 
3700
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3701
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3702
  <div class="menu-button" onclick="toggleMenu()">
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3745
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3747
  </span> |
3748
- Cell: nv | 0.70s
3749
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3751
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3761,7 +3775,7 @@ Cell: nv | 0.70s
3761
  </div>
3762
  </div>
3763
  <div id="output-nv" class="cell-output">
3764
- <div class="cell-stdout">Thu Oct 2 18:06:49 2025
3765
  +-----------------------------------------------------------------------------------------+
3766
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3767
  |-----------------------------------------+------------------------+----------------------+
@@ -3770,19 +3784,19 @@ Cell: nv | 0.70s
3770
  | | | MIG M. |
3771
  |=========================================+========================+======================|
3772
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3773
- | 0% 26C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3774
  | | | N/A |
3775
  +-----------------------------------------+------------------------+----------------------+
3776
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3777
- | 0% 26C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
3778
  | | | N/A |
3779
  +-----------------------------------------+------------------------+----------------------+
3780
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3781
- | 0% 26C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3782
  | | | N/A |
3783
  +-----------------------------------------+------------------------+----------------------+
3784
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3785
- | 0% 27C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3786
  | | | N/A |
3787
  +-----------------------------------------+------------------------+----------------------+
3788
 
@@ -3806,7 +3820,7 @@ Cell: nv | 0.70s
3806
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3807
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3808
  </span> |
3809
- Cell: benchmark | 36.63s
3810
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3811
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3812
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3886,7 +3900,7 @@ Cell: benchmark | 36.63s
3886
  </div>
3887
  <div id="output-benchmark" class="cell-output">
3888
  <div class="cell-stdout">impl wl p50(ms) ok
3889
- torch_flash_ma flux_L128 0.48 True
3890
  torch_flash_ma flux_L256 0.52 True
3891
  torch_flash_ma flux_L320 0.65 True
3892
  torch_flash_ma flux_L384 0.68 True
@@ -3897,35 +3911,35 @@ torch_flash_ma flux_L512 0.74 True
3897
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3898
  <div class="uv-logs-content" style="display: none;">
3899
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3900
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
 
 
 
3901
  Downloading networkx (1.9MiB)
3902
- Downloading kiwisolver (1.4MiB)
3903
  Downloading nvidia-cufile-cu12 (1.1MiB)
3904
- Downloading sympy (6.0MiB)
3905
- Downloading nvidia-curand-cu12 (60.7MiB)
3906
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3907
- Downloading nvidia-cufft-cu12 (184.2MiB)
3908
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3909
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
3910
  Downloading nvidia-nccl-cu12 (307.4MiB)
3911
- Downloading pillow (6.3MiB)
3912
- Downloading numpy (16.2MiB)
3913
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3914
  Downloading nvidia-cublas-cu12 (566.8MiB)
3915
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3916
  Downloading setuptools (1.1MiB)
3917
- Downloading matplotlib (8.3MiB)
3918
- Downloading triton (148.3MiB)
3919
- Downloading fonttools (4.7MiB)
3920
- Downloading torch (846.9MiB)
3921
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
 
3922
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3923
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3924
  Downloading nvidia-cufile-cu12
3925
  Downloading kiwisolver
3926
  Downloading setuptools
3927
- Downloading networkx
3928
  Downloading fonttools
 
3929
  Downloading pillow
3930
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3931
  Downloading nvidia-cuda-cupti-cu12
@@ -3941,10 +3955,10 @@ Downloading nvidia-cudnn-cu12 (674.0MiB)
3941
  Downloading nvidia-cusparselt-cu12
3942
  Downloading nvidia-cusparse-cu12
3943
  Downloading nvidia-nccl-cu12
3944
- Downloading nvidia-cudnn-cu12
3945
  Downloading nvidia-cublas-cu12
 
3946
  Downloading torch
3947
- Installed 37 packages in 548ms
3948
  </div>
3949
  </div>
3950
  <div class="cell-artifacts">
 
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
+ :root[data-ui="monocolor"] .reset-toggle,
105
+ :root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
106
  :root[data-ui="monocolor"] .menu-button:hover,
107
  :root[data-ui="monocolor"] .theme-toggle:hover,
108
+ :root[data-ui="monocolor"] .reset-toggle:hover,
109
+ :root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
111
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
112
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
 
216
  /* Keep default control styling when widgets are enabled, even in minimal UI */
217
  :root[data-ui="none"][data-widgets="on"] .menu-button,
218
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
219
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle,
220
+ :root[data-ui="none"][data-widgets="on"] .back-button {
221
  background: #f6f6f6;
222
  border: 1px solid #cccccc;
223
  color: #222222;
 
247
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
248
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
249
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
250
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
251
+ :root[data-ui="none"][data-widgets="on"] .back-button:hover {
252
  background: #ededed;
253
  border-color: #bbbbbb;
254
  color: #000000;
 
306
  }
307
 
308
  .theme-toggle,
309
+ .reset-toggle,
310
+ .back-button {
311
  background: var(--bg-secondary);
312
  border: 1px solid var(--border-primary);
313
  padding: 8px 12px;
 
318
  font-size: 0.9rem;
319
  user-select: none;
320
  }
321
+
322
+ .back-button {
323
+ text-decoration: none;
324
+ display: inline-block;
325
+ }
326
+
327
  .theme-toggle:hover,
328
+ .reset-toggle:hover,
329
+ .back-button:hover {
330
  color: var(--text-primary);
331
  background: var(--bg-tertiary);
332
  }
 
3708
  <body>
3709
  <div class="controls">
3710
  <div class="controls-buttons">
3711
+
3712
+ <a href="index.html" class="back-button">← back</a>
3713
+
3714
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3715
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3716
  <div class="menu-button" onclick="toggleMenu()">
 
3759
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3760
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3761
  </span> |
3762
+ Cell: nv | 0.67s
3763
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3764
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3765
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3775
  </div>
3776
  </div>
3777
  <div id="output-nv" class="cell-output">
3778
+ <div class="cell-stdout">Thu Oct 2 19:58:23 2025
3779
  +-----------------------------------------------------------------------------------------+
3780
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3781
  |-----------------------------------------+------------------------+----------------------+
 
3784
  | | | MIG M. |
3785
  |=========================================+========================+======================|
3786
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3787
+ | 0% 37C P0 92W / 300W | 0MiB / 23028MiB | 0% Default |
3788
  | | | N/A |
3789
  +-----------------------------------------+------------------------+----------------------+
3790
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3791
+ | 0% 29C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3792
  | | | N/A |
3793
  +-----------------------------------------+------------------------+----------------------+
3794
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3795
+ | 0% 29C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3796
  | | | N/A |
3797
  +-----------------------------------------+------------------------+----------------------+
3798
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3799
+ | 0% 30C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3800
  | | | N/A |
3801
  +-----------------------------------------+------------------------+----------------------+
3802
 
 
3820
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3821
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3822
  </span> |
3823
+ Cell: benchmark | 35.41s
3824
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3825
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3826
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3900
  </div>
3901
  <div id="output-benchmark" class="cell-output">
3902
  <div class="cell-stdout">impl wl p50(ms) ok
3903
+ torch_flash_ma flux_L128 0.49 True
3904
  torch_flash_ma flux_L256 0.52 True
3905
  torch_flash_ma flux_L320 0.65 True
3906
  torch_flash_ma flux_L384 0.68 True
 
3911
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3912
  <div class="uv-logs-content" style="display: none;">
3913
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3914
+ Downloading triton (148.3MiB)
3915
+ Downloading numpy (16.2MiB)
3916
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3917
+ Downloading matplotlib (8.3MiB)
3918
+ Downloading sympy (6.0MiB)
3919
+ Downloading fonttools (4.7MiB)
3920
  Downloading networkx (1.9MiB)
 
3921
  Downloading nvidia-cufile-cu12 (1.1MiB)
3922
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
 
 
3923
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3924
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3925
+ Downloading torch (846.9MiB)
3926
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
 
3927
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
3928
  Downloading setuptools (1.1MiB)
3929
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3930
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3931
+ Downloading kiwisolver (1.4MiB)
 
3932
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3933
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3934
+ Downloading nvidia-curand-cu12 (60.7MiB)
3935
+ Downloading pillow (6.3MiB)
3936
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3937
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3938
  Downloading nvidia-cufile-cu12
3939
  Downloading kiwisolver
3940
  Downloading setuptools
 
3941
  Downloading fonttools
3942
+ Downloading networkx
3943
  Downloading pillow
3944
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3945
  Downloading nvidia-cuda-cupti-cu12
 
3955
  Downloading nvidia-cusparselt-cu12
3956
  Downloading nvidia-cusparse-cu12
3957
  Downloading nvidia-nccl-cu12
 
3958
  Downloading nvidia-cublas-cu12
3959
+ Downloading nvidia-cudnn-cu12
3960
  Downloading torch
3961
+ Installed 37 packages in 491ms
3962
  </div>
3963
  </div>
3964
  <div class="cell-artifacts">
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -101,10 +101,12 @@
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
- :root[data-ui="monocolor"] .reset-toggle { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
105
  :root[data-ui="monocolor"] .menu-button:hover,
106
  :root[data-ui="monocolor"] .theme-toggle:hover,
107
- :root[data-ui="monocolor"] .reset-toggle:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
108
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
109
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
@@ -214,7 +216,8 @@
214
  /* Keep default control styling when widgets are enabled, even in minimal UI */
215
  :root[data-ui="none"][data-widgets="on"] .menu-button,
216
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
217
- :root[data-ui="none"][data-widgets="on"] .reset-toggle {
 
218
  background: #f6f6f6;
219
  border: 1px solid #cccccc;
220
  color: #222222;
@@ -244,7 +247,8 @@
244
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
245
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
246
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
247
- :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover {
 
248
  background: #ededed;
249
  border-color: #bbbbbb;
250
  color: #000000;
@@ -302,7 +306,8 @@
302
  }
303
 
304
  .theme-toggle,
305
- .reset-toggle {
 
306
  background: var(--bg-secondary);
307
  border: 1px solid var(--border-primary);
308
  padding: 8px 12px;
@@ -313,9 +318,15 @@
313
  font-size: 0.9rem;
314
  user-select: none;
315
  }
316
-
 
 
 
 
 
317
  .theme-toggle:hover,
318
- .reset-toggle:hover {
 
319
  color: var(--text-primary);
320
  background: var(--bg-tertiary);
321
  }
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3697
  <body>
3698
  <div class="controls">
3699
  <div class="controls-buttons">
 
 
 
3700
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3701
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3702
  <div class="menu-button" onclick="toggleMenu()">
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
- Cell: benchmark | 39.43s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3831,40 +3845,40 @@ Cell: benchmark | 39.43s
3831
  </div>
3832
  <div id="output-benchmark" class="cell-output">
3833
  <div class="cell-stdout">impl wl p50(ms) ok
3834
- hf_kernels_flash_attn flux_L128 0.34 True
3835
  hf_kernels_flash_attn flux_L256 0.38 True
3836
  hf_kernels_flash_attn flux_L320 0.49 True
3837
- hf_kernels_flash_attn flux_L384 0.51 True
3838
  hf_kernels_flash_attn flux_L448 0.54 True
3839
- hf_kernels_flash_attn flux_L512 0.55 True
3840
  </div>
3841
  <div class="uv-install-logs" id="uv-logs-benchmark">
3842
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3843
  <div class="uv-logs-content" style="display: none;">
3844
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
3845
  Downloading sympy (6.0MiB)
3846
- Downloading matplotlib (8.3MiB)
 
3847
  Downloading nvidia-nccl-cu12 (307.4MiB)
3848
- Downloading nvidia-cufile-cu12 (1.1MiB)
3849
- Downloading nvidia-curand-cu12 (60.7MiB)
3850
  Downloading networkx (1.9MiB)
3851
- Downloading torch (846.9MiB)
3852
- Downloading setuptools (1.1MiB)
 
 
3853
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3854
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3855
  Downloading triton (148.3MiB)
3856
- Downloading nvidia-cusparse-cu12 (274.9MiB)
 
3857
  Downloading numpy (16.2MiB)
3858
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3859
- Downloading kiwisolver (1.4MiB)
3860
- Downloading hf-xet (3.0MiB)
3861
- Downloading pillow (6.3MiB)
3862
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3863
- Downloading nvidia-cublas-cu12 (566.8MiB)
3864
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3865
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3866
- Downloading fonttools (4.7MiB)
3867
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
3868
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3869
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3870
  Downloading nvidia-cufile-cu12
@@ -3875,29 +3889,29 @@ Downloading nvidia-cufft-cu12 (184.2MiB)
3875
  Downloading fonttools
3876
  Downloading pillow
3877
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3878
- Downloading matplotlib
3879
  Downloading nvidia-cuda-cupti-cu12
3880
- Downloading numpy
3881
  Downloading sympy
 
3882
  Downloading nvidia-nvjitlink-cu12
3883
  Downloading nvidia-curand-cu12
3884
  Downloading nvidia-cuda-nvrtc-cu12
3885
  Downloading triton
3886
  Downloading nvidia-cufft-cu12
3887
  Downloading nvidia-cusolver-cu12
3888
- Downloading nvidia-cusparse-cu12
3889
  Downloading nvidia-cusparselt-cu12
 
3890
  Downloading nvidia-nccl-cu12
3891
  Downloading nvidia-cublas-cu12
3892
  Downloading nvidia-cudnn-cu12
3893
  Downloading torch
3894
- Installed 47 packages in 552ms
3895
  </div>
3896
  </div>
3897
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
3898
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:03, 5.41it/s]
3899
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:16, 1.09it/s]
3900
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 12.37it/s]</div>
3901
  <div class="cell-artifacts">
3902
  <h4>Artifacts:</h4>
3903
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
+ :root[data-ui="monocolor"] .reset-toggle,
105
+ :root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
106
  :root[data-ui="monocolor"] .menu-button:hover,
107
  :root[data-ui="monocolor"] .theme-toggle:hover,
108
+ :root[data-ui="monocolor"] .reset-toggle:hover,
109
+ :root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
111
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
112
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
 
216
  /* Keep default control styling when widgets are enabled, even in minimal UI */
217
  :root[data-ui="none"][data-widgets="on"] .menu-button,
218
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
219
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle,
220
+ :root[data-ui="none"][data-widgets="on"] .back-button {
221
  background: #f6f6f6;
222
  border: 1px solid #cccccc;
223
  color: #222222;
 
247
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
248
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
249
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
250
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
251
+ :root[data-ui="none"][data-widgets="on"] .back-button:hover {
252
  background: #ededed;
253
  border-color: #bbbbbb;
254
  color: #000000;
 
306
  }
307
 
308
  .theme-toggle,
309
+ .reset-toggle,
310
+ .back-button {
311
  background: var(--bg-secondary);
312
  border: 1px solid var(--border-primary);
313
  padding: 8px 12px;
 
318
  font-size: 0.9rem;
319
  user-select: none;
320
  }
321
+
322
+ .back-button {
323
+ text-decoration: none;
324
+ display: inline-block;
325
+ }
326
+
327
  .theme-toggle:hover,
328
+ .reset-toggle:hover,
329
+ .back-button:hover {
330
  color: var(--text-primary);
331
  background: var(--bg-tertiary);
332
  }
 
3708
  <body>
3709
  <div class="controls">
3710
  <div class="controls-buttons">
3711
+
3712
+ <a href="index.html" class="back-button">← back</a>
3713
+
3714
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3715
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3716
  <div class="menu-button" onclick="toggleMenu()">
 
3759
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3760
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3761
  </span> |
3762
+ Cell: benchmark | 38.65s
3763
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3764
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3765
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3845
  </div>
3846
  <div id="output-benchmark" class="cell-output">
3847
  <div class="cell-stdout">impl wl p50(ms) ok
3848
+ hf_kernels_flash_attn flux_L128 0.35 True
3849
  hf_kernels_flash_attn flux_L256 0.38 True
3850
  hf_kernels_flash_attn flux_L320 0.49 True
3851
+ hf_kernels_flash_attn flux_L384 0.52 True
3852
  hf_kernels_flash_attn flux_L448 0.54 True
3853
+ hf_kernels_flash_attn flux_L512 0.56 True
3854
  </div>
3855
  <div class="uv-install-logs" id="uv-logs-benchmark">
3856
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3857
  <div class="uv-logs-content" style="display: none;">
3858
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3859
+ Downloading kiwisolver (1.4MiB)
3860
  Downloading sympy (6.0MiB)
3861
+ Downloading hf-xet (3.0MiB)
3862
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3863
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
3864
  Downloading networkx (1.9MiB)
3865
+ Downloading pillow (6.3MiB)
3866
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3867
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3868
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3869
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3870
+ Downloading matplotlib (8.3MiB)
3871
  Downloading triton (148.3MiB)
3872
+ Downloading nvidia-curand-cu12 (60.7MiB)
3873
+ Downloading fonttools (4.7MiB)
3874
  Downloading numpy (16.2MiB)
3875
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3876
+ Downloading torch (846.9MiB)
 
 
3877
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3878
+ Downloading setuptools (1.1MiB)
 
 
 
3879
  Downloading nvidia-cufft-cu12 (184.2MiB)
3880
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3881
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3882
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3883
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3884
  Downloading nvidia-cufile-cu12
 
3889
  Downloading fonttools
3890
  Downloading pillow
3891
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
 
3892
  Downloading nvidia-cuda-cupti-cu12
3893
+ Downloading matplotlib
3894
  Downloading sympy
3895
+ Downloading numpy
3896
  Downloading nvidia-nvjitlink-cu12
3897
  Downloading nvidia-curand-cu12
3898
  Downloading nvidia-cuda-nvrtc-cu12
3899
  Downloading triton
3900
  Downloading nvidia-cufft-cu12
3901
  Downloading nvidia-cusolver-cu12
 
3902
  Downloading nvidia-cusparselt-cu12
3903
+ Downloading nvidia-cusparse-cu12
3904
  Downloading nvidia-nccl-cu12
3905
  Downloading nvidia-cublas-cu12
3906
  Downloading nvidia-cudnn-cu12
3907
  Downloading torch
3908
+ Installed 47 packages in 527ms
3909
  </div>
3910
  </div>
3911
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
3912
+ Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:03, 5.70it/s]
3913
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:13, 1.36it/s]
3914
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 15.31it/s]</div>
3915
  <div class="cell-artifacts">
3916
  <h4>Artifacts:</h4>
3917
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -101,10 +101,12 @@
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
- :root[data-ui="monocolor"] .reset-toggle { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
105
  :root[data-ui="monocolor"] .menu-button:hover,
106
  :root[data-ui="monocolor"] .theme-toggle:hover,
107
- :root[data-ui="monocolor"] .reset-toggle:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
108
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
109
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
@@ -214,7 +216,8 @@
214
  /* Keep default control styling when widgets are enabled, even in minimal UI */
215
  :root[data-ui="none"][data-widgets="on"] .menu-button,
216
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
217
- :root[data-ui="none"][data-widgets="on"] .reset-toggle {
 
218
  background: #f6f6f6;
219
  border: 1px solid #cccccc;
220
  color: #222222;
@@ -244,7 +247,8 @@
244
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
245
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
246
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
247
- :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover {
 
248
  background: #ededed;
249
  border-color: #bbbbbb;
250
  color: #000000;
@@ -302,7 +306,8 @@
302
  }
303
 
304
  .theme-toggle,
305
- .reset-toggle {
 
306
  background: var(--bg-secondary);
307
  border: 1px solid var(--border-primary);
308
  padding: 8px 12px;
@@ -313,9 +318,15 @@
313
  font-size: 0.9rem;
314
  user-select: none;
315
  }
316
-
 
 
 
 
 
317
  .theme-toggle:hover,
318
- .reset-toggle:hover {
 
319
  color: var(--text-primary);
320
  background: var(--bg-tertiary);
321
  }
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3697
  <body>
3698
  <div class="controls">
3699
  <div class="controls-buttons">
 
 
 
3700
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3701
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3702
  <div class="menu-button" onclick="toggleMenu()">
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
- Cell: benchmark | 39.41s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3833,7 +3847,7 @@ Cell: benchmark | 39.41s
3833
  hf_kernels_flash_attn3 flux_L128 0.36 True
3834
  hf_kernels_flash_attn3 flux_L256 0.39 True
3835
  hf_kernels_flash_attn3 flux_L320 0.52 True
3836
- hf_kernels_flash_attn3 flux_L384 0.52 True
3837
  hf_kernels_flash_attn3 flux_L448 0.57 True
3838
  hf_kernels_flash_attn3 flux_L512 0.57 True
3839
  </div>
@@ -3841,62 +3855,62 @@ hf_kernels_flash_attn3 flux_L512 0.57 True
3841
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3842
  <div class="uv-logs-content" style="display: none;">
3843
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3844
- Downloading nvidia-cufile-cu12 (1.1MiB)
3845
- Downloading nvidia-nccl-cu12 (307.4MiB)
3846
- Downloading setuptools (1.1MiB)
3847
  Downloading nvidia-curand-cu12 (60.7MiB)
3848
- Downloading pillow (6.3MiB)
3849
- Downloading numpy (16.2MiB)
 
 
3850
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3851
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3852
- Downloading nvidia-cufft-cu12 (184.2MiB)
3853
- Downloading networkx (1.9MiB)
3854
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3855
- Downloading sympy (6.0MiB)
3856
  Downloading hf-xet (3.0MiB)
3857
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3858
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3859
  Downloading torch (846.9MiB)
3860
- Downloading triton (148.3MiB)
3861
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3862
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3863
- Downloading nvidia-cublas-cu12 (566.8MiB)
3864
  Downloading fonttools (4.7MiB)
3865
- Downloading kiwisolver (1.4MiB)
3866
- Downloading matplotlib (8.3MiB)
 
 
 
 
3867
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3868
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3869
  Downloading nvidia-cufile-cu12
3870
  Downloading kiwisolver
3871
  Downloading hf-xet
3872
  Downloading setuptools
3873
- Downloading networkx
3874
  Downloading fonttools
 
3875
  Downloading pillow
3876
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3877
  Downloading nvidia-cuda-cupti-cu12
3878
  Downloading matplotlib
3879
- Downloading sympy
3880
  Downloading numpy
 
3881
  Downloading nvidia-nvjitlink-cu12
3882
  Downloading nvidia-curand-cu12
3883
  Downloading nvidia-cuda-nvrtc-cu12
3884
  Downloading triton
3885
  Downloading nvidia-cufft-cu12
3886
  Downloading nvidia-cusolver-cu12
3887
- Downloading nvidia-cusparse-cu12
3888
  Downloading nvidia-cusparselt-cu12
 
3889
  Downloading nvidia-nccl-cu12
3890
  Downloading nvidia-cublas-cu12
3891
  Downloading nvidia-cudnn-cu12
3892
  Downloading torch
3893
- Installed 47 packages in 529ms
3894
  </div>
3895
  </div>
3896
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
3897
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 6.35it/s]
3898
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.09it/s]
3899
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.48it/s]</div>
3900
  <div class="cell-artifacts">
3901
  <h4>Artifacts:</h4>
3902
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
+ :root[data-ui="monocolor"] .reset-toggle,
105
+ :root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
106
  :root[data-ui="monocolor"] .menu-button:hover,
107
  :root[data-ui="monocolor"] .theme-toggle:hover,
108
+ :root[data-ui="monocolor"] .reset-toggle:hover,
109
+ :root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
111
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
112
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
 
216
  /* Keep default control styling when widgets are enabled, even in minimal UI */
217
  :root[data-ui="none"][data-widgets="on"] .menu-button,
218
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
219
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle,
220
+ :root[data-ui="none"][data-widgets="on"] .back-button {
221
  background: #f6f6f6;
222
  border: 1px solid #cccccc;
223
  color: #222222;
 
247
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
248
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
249
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
250
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
251
+ :root[data-ui="none"][data-widgets="on"] .back-button:hover {
252
  background: #ededed;
253
  border-color: #bbbbbb;
254
  color: #000000;
 
306
  }
307
 
308
  .theme-toggle,
309
+ .reset-toggle,
310
+ .back-button {
311
  background: var(--bg-secondary);
312
  border: 1px solid var(--border-primary);
313
  padding: 8px 12px;
 
318
  font-size: 0.9rem;
319
  user-select: none;
320
  }
321
+
322
+ .back-button {
323
+ text-decoration: none;
324
+ display: inline-block;
325
+ }
326
+
327
  .theme-toggle:hover,
328
+ .reset-toggle:hover,
329
+ .back-button:hover {
330
  color: var(--text-primary);
331
  background: var(--bg-tertiary);
332
  }
 
3708
  <body>
3709
  <div class="controls">
3710
  <div class="controls-buttons">
3711
+
3712
+ <a href="index.html" class="back-button">← back</a>
3713
+
3714
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3715
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3716
  <div class="menu-button" onclick="toggleMenu()">
 
3759
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3760
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3761
  </span> |
3762
+ Cell: benchmark | 38.16s
3763
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3764
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3765
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3847
  hf_kernels_flash_attn3 flux_L128 0.36 True
3848
  hf_kernels_flash_attn3 flux_L256 0.39 True
3849
  hf_kernels_flash_attn3 flux_L320 0.52 True
3850
+ hf_kernels_flash_attn3 flux_L384 0.53 True
3851
  hf_kernels_flash_attn3 flux_L448 0.57 True
3852
  hf_kernels_flash_attn3 flux_L512 0.57 True
3853
  </div>
 
3855
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3856
  <div class="uv-logs-content" style="display: none;">
3857
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3858
+ Downloading networkx (1.9MiB)
3859
+ Downloading kiwisolver (1.4MiB)
3860
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3861
  Downloading nvidia-curand-cu12 (60.7MiB)
3862
+ Downloading matplotlib (8.3MiB)
3863
+ Downloading setuptools (1.1MiB)
3864
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3865
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3866
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
 
 
 
3867
  Downloading hf-xet (3.0MiB)
3868
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3869
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3870
  Downloading torch (846.9MiB)
3871
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3872
+ Downloading pillow (6.3MiB)
3873
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
3874
  Downloading fonttools (4.7MiB)
3875
+ Downloading numpy (16.2MiB)
3876
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3877
+ Downloading sympy (6.0MiB)
3878
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3879
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3880
+ Downloading triton (148.3MiB)
3881
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3882
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3883
  Downloading nvidia-cufile-cu12
3884
  Downloading kiwisolver
3885
  Downloading hf-xet
3886
  Downloading setuptools
 
3887
  Downloading fonttools
3888
+ Downloading networkx
3889
  Downloading pillow
3890
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3891
  Downloading nvidia-cuda-cupti-cu12
3892
  Downloading matplotlib
 
3893
  Downloading numpy
3894
+ Downloading sympy
3895
  Downloading nvidia-nvjitlink-cu12
3896
  Downloading nvidia-curand-cu12
3897
  Downloading nvidia-cuda-nvrtc-cu12
3898
  Downloading triton
3899
  Downloading nvidia-cufft-cu12
3900
  Downloading nvidia-cusolver-cu12
 
3901
  Downloading nvidia-cusparselt-cu12
3902
+ Downloading nvidia-cusparse-cu12
3903
  Downloading nvidia-nccl-cu12
3904
  Downloading nvidia-cublas-cu12
3905
  Downloading nvidia-cudnn-cu12
3906
  Downloading torch
3907
+ Installed 47 packages in 565ms
3908
  </div>
3909
  </div>
3910
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
3911
+ Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 5.17it/s]
3912
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.22it/s]
3913
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.76it/s]</div>
3914
  <div class="cell-artifacts">
3915
  <h4>Artifacts:</h4>
3916
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/index.html CHANGED
@@ -2,22 +2,86 @@
2
  <html>
3
  <head>
4
  <meta charset='UTF-8'>
5
- <title>Directory Index</title>
 
6
  <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  </style>
16
  </head>
17
  <body>
 
 
 
18
  <h1>Index of /flash_attn/impls</h1>
19
  <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
  <li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
22
  <li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
23
  <li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
 
2
  <html>
3
  <head>
4
  <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /flash_attn/impls</title>
7
  <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
  </style>
78
  </head>
79
  <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
  <h1>Index of /flash_attn/impls</h1>
84
  <ul>
 
85
  <li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
86
  <li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
87
  <li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -101,10 +101,12 @@
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
- :root[data-ui="monocolor"] .reset-toggle { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
105
  :root[data-ui="monocolor"] .menu-button:hover,
106
  :root[data-ui="monocolor"] .theme-toggle:hover,
107
- :root[data-ui="monocolor"] .reset-toggle:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
108
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
109
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
@@ -214,7 +216,8 @@
214
  /* Keep default control styling when widgets are enabled, even in minimal UI */
215
  :root[data-ui="none"][data-widgets="on"] .menu-button,
216
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
217
- :root[data-ui="none"][data-widgets="on"] .reset-toggle {
 
218
  background: #f6f6f6;
219
  border: 1px solid #cccccc;
220
  color: #222222;
@@ -244,7 +247,8 @@
244
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
245
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
246
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
247
- :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover {
 
248
  background: #ededed;
249
  border-color: #bbbbbb;
250
  color: #000000;
@@ -302,7 +306,8 @@
302
  }
303
 
304
  .theme-toggle,
305
- .reset-toggle {
 
306
  background: var(--bg-secondary);
307
  border: 1px solid var(--border-primary);
308
  padding: 8px 12px;
@@ -313,9 +318,15 @@
313
  font-size: 0.9rem;
314
  user-select: none;
315
  }
316
-
 
 
 
 
 
317
  .theme-toggle:hover,
318
- .reset-toggle:hover {
 
319
  color: var(--text-primary);
320
  background: var(--bg-tertiary);
321
  }
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3697
  <body>
3698
  <div class="controls">
3699
  <div class="controls-buttons">
 
 
 
3700
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3701
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3702
  <div class="menu-button" onclick="toggleMenu()">
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
- Cell: benchmark | 36.09s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3838,28 +3852,28 @@ torch_mem_eff flux_L512 0.95 True
3838
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3839
  <div class="uv-logs-content" style="display: none;">
3840
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
 
3841
  Downloading sympy (6.0MiB)
3842
- Downloading setuptools (1.1MiB)
 
3843
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
3844
  Downloading kiwisolver (1.4MiB)
3845
- Downloading nvidia-nccl-cu12 (307.4MiB)
3846
- Downloading torch (846.9MiB)
3847
- Downloading matplotlib (8.3MiB)
3848
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3849
- Downloading pillow (6.3MiB)
3850
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3851
- Downloading networkx (1.9MiB)
3852
- Downloading numpy (16.2MiB)
3853
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3854
  Downloading nvidia-cufft-cu12 (184.2MiB)
3855
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3856
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
3857
  Downloading nvidia-curand-cu12 (60.7MiB)
3858
- Downloading nvidia-cufile-cu12 (1.1MiB)
 
3859
  Downloading fonttools (4.7MiB)
3860
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3861
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3862
- Downloading triton (148.3MiB)
 
3863
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3864
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3865
  Downloading nvidia-cufile-cu12
@@ -3885,7 +3899,7 @@ Downloading triton (148.3MiB)
3885
  Downloading nvidia-cublas-cu12
3886
  Downloading nvidia-cudnn-cu12
3887
  Downloading torch
3888
- Installed 37 packages in 447ms
3889
  </div>
3890
  </div>
3891
  <div class="cell-artifacts">
 
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
+ :root[data-ui="monocolor"] .reset-toggle,
105
+ :root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
106
  :root[data-ui="monocolor"] .menu-button:hover,
107
  :root[data-ui="monocolor"] .theme-toggle:hover,
108
+ :root[data-ui="monocolor"] .reset-toggle:hover,
109
+ :root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
111
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
112
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
 
216
  /* Keep default control styling when widgets are enabled, even in minimal UI */
217
  :root[data-ui="none"][data-widgets="on"] .menu-button,
218
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
219
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle,
220
+ :root[data-ui="none"][data-widgets="on"] .back-button {
221
  background: #f6f6f6;
222
  border: 1px solid #cccccc;
223
  color: #222222;
 
247
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
248
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
249
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
250
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
251
+ :root[data-ui="none"][data-widgets="on"] .back-button:hover {
252
  background: #ededed;
253
  border-color: #bbbbbb;
254
  color: #000000;
 
306
  }
307
 
308
  .theme-toggle,
309
+ .reset-toggle,
310
+ .back-button {
311
  background: var(--bg-secondary);
312
  border: 1px solid var(--border-primary);
313
  padding: 8px 12px;
 
318
  font-size: 0.9rem;
319
  user-select: none;
320
  }
321
+
322
+ .back-button {
323
+ text-decoration: none;
324
+ display: inline-block;
325
+ }
326
+
327
  .theme-toggle:hover,
328
+ .reset-toggle:hover,
329
+ .back-button:hover {
330
  color: var(--text-primary);
331
  background: var(--bg-tertiary);
332
  }
 
3708
  <body>
3709
  <div class="controls">
3710
  <div class="controls-buttons">
3711
+
3712
+ <a href="index.html" class="back-button">← back</a>
3713
+
3714
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3715
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3716
  <div class="menu-button" onclick="toggleMenu()">
 
3759
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3760
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3761
  </span> |
3762
+ Cell: benchmark | 36.80s
3763
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3764
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3765
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3852
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3853
  <div class="uv-logs-content" style="display: none;">
3854
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3855
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3856
+ Downloading numpy (16.2MiB)
3857
  Downloading sympy (6.0MiB)
3858
+ Downloading networkx (1.9MiB)
3859
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3860
  Downloading nvidia-cublas-cu12 (566.8MiB)
3861
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3862
  Downloading kiwisolver (1.4MiB)
 
 
 
3863
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3864
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
 
 
3865
  Downloading nvidia-cufft-cu12 (184.2MiB)
3866
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3867
+ Downloading triton (148.3MiB)
3868
+ Downloading setuptools (1.1MiB)
3869
  Downloading nvidia-curand-cu12 (60.7MiB)
3870
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3871
+ Downloading pillow (6.3MiB)
3872
  Downloading fonttools (4.7MiB)
3873
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3874
+ Downloading torch (846.9MiB)
3875
+ Downloading matplotlib (8.3MiB)
3876
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3877
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3878
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3879
  Downloading nvidia-cufile-cu12
 
3899
  Downloading nvidia-cublas-cu12
3900
  Downloading nvidia-cudnn-cu12
3901
  Downloading torch
3902
+ Installed 37 packages in 448ms
3903
  </div>
3904
  </div>
3905
  <div class="cell-artifacts">
flash_attn/impls/sage_attention.html CHANGED
@@ -101,10 +101,12 @@
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
- :root[data-ui="monocolor"] .reset-toggle { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
105
  :root[data-ui="monocolor"] .menu-button:hover,
106
  :root[data-ui="monocolor"] .theme-toggle:hover,
107
- :root[data-ui="monocolor"] .reset-toggle:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
108
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
109
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
@@ -214,7 +216,8 @@
214
  /* Keep default control styling when widgets are enabled, even in minimal UI */
215
  :root[data-ui="none"][data-widgets="on"] .menu-button,
216
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
217
- :root[data-ui="none"][data-widgets="on"] .reset-toggle {
 
218
  background: #f6f6f6;
219
  border: 1px solid #cccccc;
220
  color: #222222;
@@ -244,7 +247,8 @@
244
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
245
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
246
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
247
- :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover {
 
248
  background: #ededed;
249
  border-color: #bbbbbb;
250
  color: #000000;
@@ -302,7 +306,8 @@
302
  }
303
 
304
  .theme-toggle,
305
- .reset-toggle {
 
306
  background: var(--bg-secondary);
307
  border: 1px solid var(--border-primary);
308
  padding: 8px 12px;
@@ -313,9 +318,15 @@
313
  font-size: 0.9rem;
314
  user-select: none;
315
  }
316
-
 
 
 
 
 
317
  .theme-toggle:hover,
318
- .reset-toggle:hover {
 
319
  color: var(--text-primary);
320
  background: var(--bg-tertiary);
321
  }
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3697
  <body>
3698
  <div class="controls">
3699
  <div class="controls-buttons">
 
 
 
3700
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3701
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3702
  <div class="menu-button" onclick="toggleMenu()">
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
- Cell: benchmark | 40.08s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3839,45 +3853,45 @@ Cell: benchmark | 40.08s
3839
  <div id="output-benchmark" class="cell-output">
3840
  <div class="cell-stdout">impl wl p50(ms) ok
3841
  sage_int8_fp16 flux_L128 FAIL False
3842
- Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3843
  sage_int8_fp16 flux_L256 FAIL False
3844
- Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3845
  sage_int8_fp16 flux_L320 FAIL False
3846
- Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3847
  sage_int8_fp16 flux_L384 FAIL False
3848
- Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3849
  sage_int8_fp16 flux_L448 FAIL False
3850
- Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3851
  sage_int8_fp16 flux_L512 FAIL False
3852
- Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3853
  </div>
3854
  <div class="uv-install-logs" id="uv-logs-benchmark">
3855
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3856
  <div class="uv-logs-content" style="display: none;">
3857
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3858
- Downloading sympy (6.0MiB)
3859
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3860
- Downloading nvidia-cufft-cu12 (184.2MiB)
3861
- Downloading numpy (16.2MiB)
3862
  Downloading nvidia-cufile-cu12 (1.1MiB)
3863
- Downloading networkx (1.9MiB)
3864
- Downloading hf-xet (3.0MiB)
3865
- Downloading nvidia-cublas-cu12 (566.8MiB)
3866
  Downloading nvidia-nccl-cu12 (307.4MiB)
3867
- Downloading pillow (6.3MiB)
3868
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3869
- Downloading triton (148.3MiB)
 
 
 
 
 
3870
  Downloading setuptools (1.1MiB)
3871
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
 
3872
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3873
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3874
- Downloading matplotlib (8.3MiB)
 
 
3875
  Downloading nvidia-curand-cu12 (60.7MiB)
3876
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3877
- Downloading kiwisolver (1.4MiB)
3878
- Downloading fonttools (4.7MiB)
3879
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3880
  Downloading torch (846.9MiB)
 
 
3881
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3882
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3883
  Downloading nvidia-cufile-cu12
@@ -3888,8 +3902,8 @@ Downloading torch (846.9MiB)
3888
  Downloading networkx
3889
  Downloading pillow
3890
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3891
- Downloading nvidia-cuda-cupti-cu12
3892
  Downloading matplotlib
 
3893
  Downloading numpy
3894
  Downloading sympy
3895
  Downloading nvidia-nvjitlink-cu12
@@ -3898,20 +3912,19 @@ Downloading torch (846.9MiB)
3898
  Downloading triton
3899
  Downloading nvidia-cufft-cu12
3900
  Downloading nvidia-cusolver-cu12
3901
- Downloading nvidia-cusparselt-cu12
3902
  Downloading nvidia-cusparse-cu12
 
3903
  Downloading nvidia-nccl-cu12
3904
  Downloading nvidia-cublas-cu12
3905
  Downloading nvidia-cudnn-cu12
3906
  Downloading torch
3907
- Installed 48 packages in 531ms
3908
  </div>
3909
  </div>
3910
  <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3911
- Fetching 11 files: 9%|▉ | 1/11 [00:00&lt;00:02, 4.42it/s]
3912
- Fetching 11 files: 27%|██▋ | 3/11 [00:00&lt;00:01, 5.95it/s]
3913
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 12.27it/s]
3914
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 14.37it/s]</div>
3915
  <div class="cell-artifacts">
3916
  <h4>Artifacts:</h4>
3917
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
+ :root[data-ui="monocolor"] .reset-toggle,
105
+ :root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
106
  :root[data-ui="monocolor"] .menu-button:hover,
107
  :root[data-ui="monocolor"] .theme-toggle:hover,
108
+ :root[data-ui="monocolor"] .reset-toggle:hover,
109
+ :root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
111
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
112
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
 
216
  /* Keep default control styling when widgets are enabled, even in minimal UI */
217
  :root[data-ui="none"][data-widgets="on"] .menu-button,
218
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
219
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle,
220
+ :root[data-ui="none"][data-widgets="on"] .back-button {
221
  background: #f6f6f6;
222
  border: 1px solid #cccccc;
223
  color: #222222;
 
247
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
248
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
249
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
250
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
251
+ :root[data-ui="none"][data-widgets="on"] .back-button:hover {
252
  background: #ededed;
253
  border-color: #bbbbbb;
254
  color: #000000;
 
306
  }
307
 
308
  .theme-toggle,
309
+ .reset-toggle,
310
+ .back-button {
311
  background: var(--bg-secondary);
312
  border: 1px solid var(--border-primary);
313
  padding: 8px 12px;
 
318
  font-size: 0.9rem;
319
  user-select: none;
320
  }
321
+
322
+ .back-button {
323
+ text-decoration: none;
324
+ display: inline-block;
325
+ }
326
+
327
  .theme-toggle:hover,
328
+ .reset-toggle:hover,
329
+ .back-button:hover {
330
  color: var(--text-primary);
331
  background: var(--bg-tertiary);
332
  }
 
3708
  <body>
3709
  <div class="controls">
3710
  <div class="controls-buttons">
3711
+
3712
+ <a href="index.html" class="back-button">← back</a>
3713
+
3714
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3715
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3716
  <div class="menu-button" onclick="toggleMenu()">
 
3759
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3760
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3761
  </span> |
3762
+ Cell: benchmark | 40.58s
3763
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3764
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3765
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3853
  <div id="output-benchmark" class="cell-output">
3854
  <div class="cell-stdout">impl wl p50(ms) ok
3855
  sage_int8_fp16 flux_L128 FAIL False
3856
+ Error: module &#x27;sage_attention_1863f4c92418f0f6&#x27; has no attribute &#x27;fwd&#x27;
3857
  sage_int8_fp16 flux_L256 FAIL False
3858
+ Error: module &#x27;sage_attention_1863f4c92418f0f6&#x27; has no attribute &#x27;fwd&#x27;
3859
  sage_int8_fp16 flux_L320 FAIL False
3860
+ Error: module &#x27;sage_attention_1863f4c92418f0f6&#x27; has no attribute &#x27;fwd&#x27;
3861
  sage_int8_fp16 flux_L384 FAIL False
3862
+ Error: module &#x27;sage_attention_1863f4c92418f0f6&#x27; has no attribute &#x27;fwd&#x27;
3863
  sage_int8_fp16 flux_L448 FAIL False
3864
+ Error: module &#x27;sage_attention_1863f4c92418f0f6&#x27; has no attribute &#x27;fwd&#x27;
3865
  sage_int8_fp16 flux_L512 FAIL False
3866
+ Error: module &#x27;sage_attention_1863f4c92418f0f6&#x27; has no attribute &#x27;fwd&#x27;
3867
  </div>
3868
  <div class="uv-install-logs" id="uv-logs-benchmark">
3869
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3870
  <div class="uv-logs-content" style="display: none;">
3871
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
 
 
 
3872
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
 
3873
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
3874
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3875
+ Downloading pillow (6.3MiB)
3876
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3877
+ Downloading hf-xet (3.0MiB)
3878
+ Downloading networkx (1.9MiB)
3879
+ Downloading numpy (16.2MiB)
3880
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3881
  Downloading setuptools (1.1MiB)
3882
+ Downloading kiwisolver (1.4MiB)
3883
+ Downloading matplotlib (8.3MiB)
3884
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3885
+ Downloading fonttools (4.7MiB)
3886
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3887
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3888
+ Downloading triton (148.3MiB)
3889
+ Downloading sympy (6.0MiB)
3890
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3891
  Downloading nvidia-curand-cu12 (60.7MiB)
 
 
 
 
3892
  Downloading torch (846.9MiB)
3893
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3894
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3895
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3896
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3897
  Downloading nvidia-cufile-cu12
 
3902
  Downloading networkx
3903
  Downloading pillow
3904
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
 
3905
  Downloading matplotlib
3906
+ Downloading nvidia-cuda-cupti-cu12
3907
  Downloading numpy
3908
  Downloading sympy
3909
  Downloading nvidia-nvjitlink-cu12
 
3912
  Downloading triton
3913
  Downloading nvidia-cufft-cu12
3914
  Downloading nvidia-cusolver-cu12
 
3915
  Downloading nvidia-cusparse-cu12
3916
+ Downloading nvidia-cusparselt-cu12
3917
  Downloading nvidia-nccl-cu12
3918
  Downloading nvidia-cublas-cu12
3919
  Downloading nvidia-cudnn-cu12
3920
  Downloading torch
3921
+ Installed 48 packages in 591ms
3922
  </div>
3923
  </div>
3924
  <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3925
+ Fetching 11 files: 9%|▉ | 1/11 [00:00&lt;00:01, 5.59it/s]
3926
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 12.79it/s]
3927
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 16.77it/s]</div>
 
3928
  <div class="cell-artifacts">
3929
  <h4>Artifacts:</h4>
3930
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/xformers.html CHANGED
@@ -101,10 +101,12 @@
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
- :root[data-ui="monocolor"] .reset-toggle { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
105
  :root[data-ui="monocolor"] .menu-button:hover,
106
  :root[data-ui="monocolor"] .theme-toggle:hover,
107
- :root[data-ui="monocolor"] .reset-toggle:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
 
108
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
109
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
@@ -214,7 +216,8 @@
214
  /* Keep default control styling when widgets are enabled, even in minimal UI */
215
  :root[data-ui="none"][data-widgets="on"] .menu-button,
216
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
217
- :root[data-ui="none"][data-widgets="on"] .reset-toggle {
 
218
  background: #f6f6f6;
219
  border: 1px solid #cccccc;
220
  color: #222222;
@@ -244,7 +247,8 @@
244
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
245
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
246
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
247
- :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover {
 
248
  background: #ededed;
249
  border-color: #bbbbbb;
250
  color: #000000;
@@ -302,7 +306,8 @@
302
  }
303
 
304
  .theme-toggle,
305
- .reset-toggle {
 
306
  background: var(--bg-secondary);
307
  border: 1px solid var(--border-primary);
308
  padding: 8px 12px;
@@ -313,9 +318,15 @@
313
  font-size: 0.9rem;
314
  user-select: none;
315
  }
316
-
 
 
 
 
 
317
  .theme-toggle:hover,
318
- .reset-toggle:hover {
 
319
  color: var(--text-primary);
320
  background: var(--bg-tertiary);
321
  }
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3697
  <body>
3698
  <div class="controls">
3699
  <div class="controls-buttons">
 
 
 
3700
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3701
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3702
  <div class="menu-button" onclick="toggleMenu()">
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
- Cell: benchmark | 40.41s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3838,38 +3852,38 @@ xformers_meff flux_L512 0.65 True
3838
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3839
  <div class="uv-logs-content" style="display: none;">
3840
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3841
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3842
- Downloading pillow (6.3MiB)
3843
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3844
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3845
- Downloading nvidia-cublas-cu12 (566.8MiB)
3846
  Downloading nvidia-curand-cu12 (60.7MiB)
 
3847
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3848
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3849
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3850
- Downloading numpy (16.2MiB)
3851
- Downloading nvidia-nccl-cu12 (307.4MiB)
3852
  Downloading setuptools (1.1MiB)
3853
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3854
- Downloading nvidia-cufile-cu12 (1.1MiB)
3855
- Downloading networkx (1.9MiB)
3856
- Downloading kiwisolver (1.4MiB)
3857
- Downloading nvidia-cufft-cu12 (184.2MiB)
3858
  Downloading torch (846.9MiB)
3859
  Downloading matplotlib (8.3MiB)
3860
- Downloading triton (148.3MiB)
3861
- Downloading sympy (6.0MiB)
3862
  Downloading fonttools (4.7MiB)
 
 
 
 
 
3863
  Downloading xformers (111.8MiB)
 
 
 
 
 
3864
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3865
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3866
  Downloading nvidia-cufile-cu12
3867
  Downloading kiwisolver
3868
  Downloading setuptools
3869
- Downloading fonttools
3870
  Downloading networkx
3871
- Downloading pillow
3872
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
 
3873
  Downloading nvidia-cuda-cupti-cu12
3874
  Downloading matplotlib
3875
  Downloading sympy
@@ -3884,10 +3898,10 @@ Downloading xformers (111.8MiB)
3884
  Downloading nvidia-cusparselt-cu12
3885
  Downloading nvidia-cusparse-cu12
3886
  Downloading nvidia-nccl-cu12
3887
- Downloading nvidia-cublas-cu12
3888
  Downloading nvidia-cudnn-cu12
 
3889
  Downloading torch
3890
- Installed 38 packages in 452ms
3891
  </div>
3892
  </div>
3893
  <div class="cell-artifacts">
 
101
  :root[data-ui="monocolor"] a { color: var(--mono-color); }
102
  :root[data-ui="monocolor"] .menu-button,
103
  :root[data-ui="monocolor"] .theme-toggle,
104
+ :root[data-ui="monocolor"] .reset-toggle,
105
+ :root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
106
  :root[data-ui="monocolor"] .menu-button:hover,
107
  :root[data-ui="monocolor"] .theme-toggle:hover,
108
+ :root[data-ui="monocolor"] .reset-toggle:hover,
109
+ :root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
110
  :root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
111
  :root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
112
  :root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
 
216
  /* Keep default control styling when widgets are enabled, even in minimal UI */
217
  :root[data-ui="none"][data-widgets="on"] .menu-button,
218
  :root[data-ui="none"][data-widgets="on"] .theme-toggle,
219
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle,
220
+ :root[data-ui="none"][data-widgets="on"] .back-button {
221
  background: #f6f6f6;
222
  border: 1px solid #cccccc;
223
  color: #222222;
 
247
  :root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
248
  :root[data-ui="none"][data-widgets="on"] .menu-button:hover,
249
  :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
250
+ :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
251
+ :root[data-ui="none"][data-widgets="on"] .back-button:hover {
252
  background: #ededed;
253
  border-color: #bbbbbb;
254
  color: #000000;
 
306
  }
307
 
308
  .theme-toggle,
309
+ .reset-toggle,
310
+ .back-button {
311
  background: var(--bg-secondary);
312
  border: 1px solid var(--border-primary);
313
  padding: 8px 12px;
 
318
  font-size: 0.9rem;
319
  user-select: none;
320
  }
321
+
322
+ .back-button {
323
+ text-decoration: none;
324
+ display: inline-block;
325
+ }
326
+
327
  .theme-toggle:hover,
328
+ .reset-toggle:hover,
329
+ .back-button:hover {
330
  color: var(--text-primary);
331
  background: var(--bg-tertiary);
332
  }
 
3708
  <body>
3709
  <div class="controls">
3710
  <div class="controls-buttons">
3711
+
3712
+ <a href="index.html" class="back-button">← back</a>
3713
+
3714
  <div class="theme-toggle" onclick="toggleTheme()">light</div>
3715
  <div class="reset-toggle" onclick="resetLayout()">reset</div>
3716
  <div class="menu-button" onclick="toggleMenu()">
 
3759
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3760
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3761
  </span> |
3762
+ Cell: benchmark | 42.08s
3763
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3764
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3765
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3852
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3853
  <div class="uv-logs-content" style="display: none;">
3854
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3855
+ Downloading numpy (16.2MiB)
3856
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3857
+ Downloading networkx (1.9MiB)
 
 
3858
  Downloading nvidia-curand-cu12 (60.7MiB)
3859
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3860
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
3861
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3862
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
3863
  Downloading setuptools (1.1MiB)
 
 
 
 
 
3864
  Downloading torch (846.9MiB)
3865
  Downloading matplotlib (8.3MiB)
 
 
3866
  Downloading fonttools (4.7MiB)
3867
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3868
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3869
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3870
+ Downloading kiwisolver (1.4MiB)
3871
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3872
  Downloading xformers (111.8MiB)
3873
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3874
+ Downloading pillow (6.3MiB)
3875
+ Downloading sympy (6.0MiB)
3876
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3877
+ Downloading triton (148.3MiB)
3878
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3879
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3880
  Downloading nvidia-cufile-cu12
3881
  Downloading kiwisolver
3882
  Downloading setuptools
 
3883
  Downloading networkx
3884
+ Downloading fonttools
3885
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3886
+ Downloading pillow
3887
  Downloading nvidia-cuda-cupti-cu12
3888
  Downloading matplotlib
3889
  Downloading sympy
 
3898
  Downloading nvidia-cusparselt-cu12
3899
  Downloading nvidia-cusparse-cu12
3900
  Downloading nvidia-nccl-cu12
 
3901
  Downloading nvidia-cudnn-cu12
3902
+ Downloading nvidia-cublas-cu12
3903
  Downloading torch
3904
+ Installed 38 packages in 541ms
3905
  </div>
3906
  </div>
3907
  <div class="cell-artifacts">
flash_attn/index.html CHANGED
@@ -2,22 +2,86 @@
2
  <html>
3
  <head>
4
  <meta charset='UTF-8'>
5
- <title>Directory Index</title>
 
6
  <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  </style>
16
  </head>
17
  <body>
 
 
 
18
  <h1>Index of /flash_attn</h1>
19
  <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
  <li><a href='impls/index.html' class='dir'>impls/</a></li>
22
  <li><a href='results/index.html' class='dir'>results/</a></li>
23
  </ul>
 
2
  <html>
3
  <head>
4
  <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /flash_attn</title>
7
  <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
  </style>
78
  </head>
79
  <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
  <h1>Index of /flash_attn</h1>
84
  <ul>
 
85
  <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
  <li><a href='results/index.html' class='dir'>results/</a></li>
87
  </ul>
flash_attn/results/artifacts/combine/latency.csv CHANGED
@@ -1,43 +1,43 @@
1
  Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
2
- Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.48577280044555665,0.47836801409721375,0.4803520143032074,0.4827199876308441,5,83.38,FLASH,torch-sdpa
3
- Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5229184031486511,0.521727979183197,0.5228800177574158,0.5234559774398804,5,90.62,FLASH,torch-sdpa
4
- Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.6515967845916748,0.6503999829292297,0.650879979133606,0.6513599753379822,5,95.06,FLASH,torch-sdpa
5
- Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.6807615995407105,0.6797440052032471,0.6808639764785767,0.6815680265426636,5,99.88,FLASH,torch-sdpa
6
- Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.711027193069458,0.7058879733085632,0.7121919989585876,0.7131519913673401,5,103.81,FLASH,torch-sdpa
7
- Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.7391423940658569,0.7369279861450195,0.7383999824523926,0.7408959865570068,5,109.12,FLASH,torch-sdpa
8
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.5875647902488709,0.5863680243492126,0.5874559879302979,0.5876479744911194,5,83.38,EFFICIENT,torch-sdpa
9
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.653657603263855,0.6485440135002136,0.6537600159645081,0.656544029712677,5,90.62,EFFICIENT,torch-sdpa
10
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.7784512042999268,0.774944007396698,0.778656005859375,0.7801600098609924,5,95.94,EFFICIENT,torch-sdpa
11
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.7922943949699401,0.791263997554779,0.7924799919128418,0.7927039861679077,5,100.0,EFFICIENT,torch-sdpa
12
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.848089587688446,0.8444799780845642,0.8470079898834229,0.8499199748039246,5,103.81,EFFICIENT,torch-sdpa
13
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,0.9523776054382325,0.95004802942276,0.9519039988517761,0.9541119933128357,5,109.12,EFFICIENT,torch-sdpa
14
- xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.45066879987716674,0.4474239945411682,0.44921600818634033,0.45241600275039673,5,83.38,memory_efficient,xformers
15
- xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.47004159688949587,0.4652479887008667,0.4705919921398163,0.4716799855232239,5,90.62,memory_efficient,xformers
16
- xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.6022783994674683,0.5987840294837952,0.6021760106086731,0.6045759916305542,5,95.06,memory_efficient,xformers
17
- xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.6013055920600892,0.6000319719314575,0.600383996963501,0.6016640067100525,5,99.88,memory_efficient,xformers
18
- xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.6408192038536071,0.639136016368866,0.6404479742050171,0.6416320204734802,5,103.81,memory_efficient,xformers
19
- xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.6466111898422241,0.6447359919548035,0.6462399959564209,0.6483839750289917,5,109.12,memory_efficient,xformers
20
- Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.527347207069397,0.5194560289382935,0.5272960066795349,0.5312960147857666,5,83.38,FLASH,torch-sdpa
21
- Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.5586367964744567,0.5560640096664429,0.5571519732475281,0.5611839890480042,5,90.62,FLASH,torch-sdpa
22
- Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.6860736012458801,0.6841920018196106,0.6860160231590271,0.6869760155677795,5,95.25,FLASH,torch-sdpa
23
- Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.7167360067367554,0.7152000069618225,0.7161920070648193,0.7164160013198853,5,99.88,FLASH,torch-sdpa
24
- Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.7423295855522156,0.7400959730148315,0.742143988609314,0.7431039810180664,5,103.81,FLASH,torch-sdpa
25
- Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.7743871927261352,0.7718080282211304,0.7745919823646545,0.7748159766197205,5,109.12,FLASH,torch-sdpa
26
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.6489088058471679,0.6148160099983215,0.6296960115432739,0.6522240042686462,5,67.5,FLASH,torch-sdpa
27
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.700761592388153,0.6615359783172607,0.6821119785308838,0.7128959894180298,5,75.0,FLASH,torch-sdpa
28
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.834444797039032,0.7967039942741394,0.8164799809455872,0.8463680148124695,5,80.38,FLASH,torch-sdpa
29
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.8709375977516174,0.8432319760322571,0.8498560190200806,0.8750079870223999,5,82.5,FLASH,torch-sdpa
30
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.9069631934165955,0.8775359988212585,0.9030719995498657,0.903872013092041,5,86.25,FLASH,torch-sdpa
31
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.9371584057807922,0.9145920276641846,0.9164159893989563,0.9357439875602722,5,90.0,FLASH,torch-sdpa
32
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.34446719884872434,0.3438720107078552,0.3445119857788086,0.34457600116729736,5,83.38,flash-attn,hf-kernels
33
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.37571839094161985,0.37404799461364746,0.3763839900493622,0.3766399919986725,5,90.62,flash-attn,hf-kernels
34
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.4945920050144196,0.4925439953804016,0.493120014667511,0.4938240051269531,5,95.06,flash-attn,hf-kernels
35
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.5139647841453552,0.5123199820518494,0.5142719745635986,0.5147839784622192,5,99.88,flash-attn,hf-kernels
36
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.5353855967521668,0.5339199900627136,0.5350080132484436,0.5352320075035095,5,103.81,flash-attn,hf-kernels
37
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.5548800110816956,0.5538560152053833,0.5548800230026245,0.5553280115127563,5,109.12,flash-attn,hf-kernels
38
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.3617343962192535,0.36102399230003357,0.3616960048675537,0.36211198568344116,5,83.38,flash-attn3,hf-kernels
39
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3907967984676361,0.3885439932346344,0.39056000113487244,0.3906239867210388,5,90.62,flash-attn3,hf-kernels
40
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.5228991985321045,0.521344006061554,0.5230720043182373,0.5232319831848145,5,95.06,flash-attn3,hf-kernels
41
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.5254656076431274,0.523904025554657,0.5249919891357422,0.526528000831604,5,99.88,flash-attn3,hf-kernels
42
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.5646592020988465,0.5627840161323547,0.565343976020813,0.565343976020813,5,103.81,flash-attn3,hf-kernels
43
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.5698879957199097,0.567359983921051,0.5696640014648438,0.5698559880256653,5,109.12,flash-attn3,hf-kernels
 
1
  Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
2
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.49411200881004336,0.48844799399375916,0.4936000108718872,0.4944640100002289,5,83.38,FLASH,torch-sdpa
3
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5234112024307251,0.5224320292472839,0.5235199928283691,0.5235840082168579,5,90.62,FLASH,torch-sdpa
4
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.6527232170104981,0.6503040194511414,0.6524800062179565,0.6545600295066833,5,95.06,FLASH,torch-sdpa
5
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.682803213596344,0.6805760264396667,0.6828799843788147,0.6832640171051025,5,99.88,FLASH,torch-sdpa
6
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.7075456142425537,0.7057600021362305,0.7063360214233398,0.7070720195770264,5,103.81,FLASH,torch-sdpa
7
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.7379711985588073,0.7368639707565308,0.7372480034828186,0.7391039729118347,5,109.12,FLASH,torch-sdpa
8
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.5874239921569824,0.5861759781837463,0.5873280167579651,0.5877439975738525,5,83.38,EFFICIENT,torch-sdpa
9
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.6502719998359681,0.6490240097045898,0.649183988571167,0.6517760157585144,5,90.62,EFFICIENT,torch-sdpa
10
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.7812095880508423,0.7761600017547607,0.7803199887275696,0.7852799892425537,5,95.94,EFFICIENT,torch-sdpa
11
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.7948480010032654,0.7911999821662903,0.7935360074043274,0.7948480248451233,5,100.0,EFFICIENT,torch-sdpa
12
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.8463295936584473,0.8449919819831848,0.8459839820861816,0.8461120128631592,5,103.81,EFFICIENT,torch-sdpa
13
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,0.9538687944412232,0.9492800235748291,0.9518399834632874,0.9581760168075562,5,109.12,EFFICIENT,torch-sdpa
14
+ xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.4515071928501129,0.44364801049232483,0.4524799883365631,0.4557119905948639,5,83.38,memory_efficient,xformers
15
+ xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.46787199974060056,0.46489599347114563,0.4684160053730011,0.46908798813819885,5,90.62,memory_efficient,xformers
16
+ xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.6001471996307373,0.596992015838623,0.5984640121459961,0.6016640067100525,5,95.06,memory_efficient,xformers
17
+ xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.6023231983184815,0.5997440218925476,0.6031039953231812,0.6032639741897583,5,99.88,memory_efficient,xformers
18
+ xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.6411136031150818,0.6381760239601135,0.6414719820022583,0.6421440243721008,5,103.81,memory_efficient,xformers
19
+ xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.6594688057899475,0.6441280245780945,0.6496639847755432,0.6527680158615112,5,109.12,memory_efficient,xformers
20
+ Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.5181439876556396,0.5141760110855103,0.5175679922103882,0.5197759866714478,5,83.38,FLASH,torch-sdpa
21
+ Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.5579584002494812,0.5549119710922241,0.5582720041275024,0.5598080158233643,5,90.62,FLASH,torch-sdpa
22
+ Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.6872959971427918,0.6853119730949402,0.687391996383667,0.6883519887924194,5,95.25,FLASH,torch-sdpa
23
+ Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.716153597831726,0.7128639817237854,0.7160959839820862,0.7167680263519287,5,99.88,FLASH,torch-sdpa
24
+ Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.7418303966522217,0.7386879920959473,0.7400959730148315,0.7415040135383606,5,103.81,FLASH,torch-sdpa
25
+ Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.7745471954345703,0.7708160281181335,0.7740799784660339,0.7753919959068298,5,109.12,FLASH,torch-sdpa
26
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.6468096017837525,0.6144000291824341,0.6245759725570679,0.6483200192451477,5,67.5,FLASH,torch-sdpa
27
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.7060160160064697,0.6689280271530151,0.6851199865341187,0.7184960246086121,5,75.0,FLASH,torch-sdpa
28
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.8332608103752136,0.7953600287437439,0.8155840039253235,0.8403519988059998,5,80.38,FLASH,torch-sdpa
29
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.8719295978546142,0.8470720052719116,0.849727988243103,0.8745279908180237,5,82.5,FLASH,torch-sdpa
30
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.9034304022789001,0.8677120208740234,0.8835520148277283,0.9034240245819092,5,86.25,FLASH,torch-sdpa
31
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.9387519836425782,0.9154239892959595,0.9213759899139404,0.9359679818153381,5,90.0,FLASH,torch-sdpa
32
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.3455295979976654,0.34355199337005615,0.34563198685646057,0.34643200039863586,5,83.38,flash-attn,hf-kernels
33
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.3756160080432892,0.37411201000213623,0.3752000033855438,0.3770880103111267,5,90.62,flash-attn,hf-kernels
34
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.4953216016292572,0.49324798583984375,0.49433600902557373,0.49663999676704407,5,95.06,flash-attn,hf-kernels
35
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.5157055854797363,0.5142719745635986,0.516319990158081,0.516543984413147,5,99.88,flash-attn,hf-kernels
36
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.5356672048568726,0.5346879959106445,0.5358080267906189,0.5361599922180176,5,103.81,flash-attn,hf-kernels
37
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.5587136030197144,0.5557760000228882,0.5574079751968384,0.5581120252609253,5,109.12,flash-attn,hf-kernels
38
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.3619711995124817,0.3603839874267578,0.361952006816864,0.3624640107154846,5,83.38,flash-attn3,hf-kernels
39
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3912447988986969,0.3892799913883209,0.3909760117530823,0.3922559916973114,5,90.62,flash-attn3,hf-kernels
40
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.5258048176765442,0.5240640044212341,0.5248960256576538,0.5248960256576538,5,95.06,flash-attn3,hf-kernels
41
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.5276032090187073,0.5265600085258484,0.5277760028839111,0.5282559990882874,5,99.88,flash-attn3,hf-kernels
42
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.5656383991241455,0.5639039874076843,0.5657920241355896,0.5668479800224304,5,103.81,flash-attn3,hf-kernels
43
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.5789952039718628,0.5689600110054016,0.5698239803314209,0.5713919997215271,5,109.12,flash-attn3,hf-kernels
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 587d477bc9dc161a51c012142295b5a6efa33e48054fc657106ca27ba64b8683
  • Pointer size: 130 Bytes
  • Size of remote file: 28.3 kB

Git LFS Details

  • SHA256: 2c1da56080e7fd1a85c14295083b11d6bac981f6fb3faef98b0753eb2c1676c7
  • Pointer size: 130 Bytes
  • Size of remote file: 28.2 kB
flash_attn/results/combined_results.html CHANGED
The diff for this file is too large to render. See raw diff
 
flash_attn/results/index.html CHANGED
@@ -2,22 +2,86 @@
2
  <html>
3
  <head>
4
  <meta charset='UTF-8'>
5
- <title>Directory Index</title>
 
6
  <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  </style>
16
  </head>
17
  <body>
 
 
 
18
  <h1>Index of /flash_attn/results</h1>
19
  <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
  <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
22
  </ul>
23
  </body>
 
2
  <html>
3
  <head>
4
  <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /flash_attn/results</title>
7
  <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
  </style>
78
  </head>
79
  <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
  <h1>Index of /flash_attn/results</h1>
84
  <ul>
 
85
  <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
86
  </ul>
87
  </body>
index.html CHANGED
@@ -2,16 +2,78 @@
2
  <html>
3
  <head>
4
  <meta charset='UTF-8'>
5
- <title>Directory Index</title>
 
6
  <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  </style>
16
  </head>
17
  <body>
 
2
  <html>
3
  <head>
4
  <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /</title>
7
  <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
  </style>
78
  </head>
79
  <body>